bench : print system info before ctx check

stream : add nullptr check of whisper_context (#3283 )
* stream : add nullptr check of whisper_context This commit adds a check to ensure that the `whisper_context` is not null after initialization. The motivation for this is that currently, if the initialization fails, the program continues to run leading to a segmentation fault. This sort of check is performed by others examples like whisper-cli. Refs: https://github.com/ggml-org/whisper.cpp/issues/3280#issuecomment-3003778035 * examples : add nullptr check for whisper_context
2025-08-10 05:38:07 +02:00 · 2025-06-25 16:01:32 +03:00 · 2025-06-25 14:16:31 +02:00 · 2025-06-25 12:12:36 +02:00 · 2025-06-25 06:35:38 +02:00 · 2025-06-24 09:24:27 +02:00
360 changed files with 61520 additions and 28341 deletions
--- a/.devops/main-cuda.Dockerfile
+++ b/.devops/main-cuda.Dockerfile
@ -16,6 +16,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}

 RUN apt-get update && \
    apt-get install -y build-essential libsdl2-dev wget cmake git \
+    && apt-get clean \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

 # Ref: https://stackoverflow.com/a/53464012
@ -26,6 +27,12 @@ COPY .. .
 # Enable cuBLAS
 RUN make base.en CMAKE_ARGS="-DGGML_CUDA=1"

+RUN find /app/build -name "*.o" -delete && \
+    find /app/build -name "*.a" -delete && \
+    rm -rf /app/build/CMakeFiles && \
+    rm -rf /app/build/cmake_install.cmake && \
+    rm -rf /app/build/_deps
+
 FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
 ENV CUDA_MAIN_VERSION=12.3
 ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
@ -33,8 +40,11 @@ WORKDIR /app

 RUN apt-get update && \
  apt-get install -y curl ffmpeg wget cmake git \
+  && apt-get clean \
  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

 COPY --from=build /app /app
+RUN du -sh /app/*
+RUN find /app -type f -size +100M
 ENV PATH=/app/build/bin:$PATH
 ENTRYPOINT [ "bash", "-c" ]
--- a/.devops/main-intel.Dockerfile
+++ b/.devops/main-intel.Dockerfile
@ -0,0 +1,28 @@
+ARG ONEAPI_VERSION=2025.1.1-0-devel-ubuntu24.04
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS build
+WORKDIR /app
+
+RUN apt-get update && \
+    apt-get install -y build-essential libsdl2-dev wget cmake git \
+    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY .. .
+# Enable SYCL
+ARG GGML_SYCL_F16=OFF
+RUN if [ "${GGML_SYCL_F16}" = "ON" ]; then \
+        echo "GGML_SYCL_F16 is set" \
+        && export OPT_SYCL_F16="-DGGML_SYCL_F16=ON"; \
+    fi && \
+    make base.en CMAKE_ARGS="-DGGML_SYCL=1 -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ${OPT_SYCL_F16}"
+
+FROM intel/oneapi-basekit:$ONEAPI_VERSION AS runtime
+WORKDIR /app
+
+RUN apt-get update && \
+  apt-get install -y curl ffmpeg libsdl2-dev wget cmake git \
+  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+
+COPY --from=build /app /app
+ENV PATH=/app/build/bin:$PATH
+ENTRYPOINT [ "bash", "-c" ]
--- a/.devops/main-musa.Dockerfile
+++ b/.devops/main-musa.Dockerfile
@ -1,29 +1,39 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc3.1.1
+ARG MUSA_VERSION=rc4.0.1
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
 # Target the MUSA runtime image
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}

 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 WORKDIR /app

 RUN apt-get update && \
-    apt-get install -y build-essential libsdl2-dev wget cmake git \
-    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+    apt-get install -y build-essential libsdl2-dev wget cmake git && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/*

 COPY .. .
 # Enable muBLAS
 RUN make base.en CMAKE_ARGS="-DGGML_MUSA=1"

+RUN find /app/build -name "*.o" -delete && \
+    find /app/build -name "*.a" -delete && \
+    rm -rf /app/build/CMakeFiles && \
+    rm -rf /app/build/cmake_install.cmake && \
+    rm -rf /app/build/_deps
+
 FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime
 WORKDIR /app

 RUN apt-get update && \
-  apt-get install -y curl ffmpeg wget cmake git \
-  && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
+    apt-get install -y curl ffmpeg wget cmake git && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* /tmp/* /var/tmp/*

 COPY --from=build /app /app
+RUN du -sh /app/*
+RUN find /app -type f -size +100M
 ENV PATH=/app/build/bin:$PATH
 ENTRYPOINT [ "bash", "-c" ]
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -101,6 +101,10 @@ jobs:
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
+            export DEBIAN_FRONTEND=noninteractive
+            sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+            sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+
            apt update
            apt install -y build-essential libsdl2-dev cmake git
            cmake -B build
@ -129,6 +133,14 @@ jobs:
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
+            export DEBIAN_FRONTEND=noninteractive
+            sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+            sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+
+            apt-get update
+            apt-get install -y ca-certificates
+            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
+
            apt update
            apt install -y build-essential libsdl2-dev cmake git
            cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a
@ -157,6 +169,14 @@ jobs:
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
+            export DEBIAN_FRONTEND=noninteractive
+            sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+            sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+
+            apt-get update
+            apt-get install -y ca-certificates
+            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
+
            apt update
            apt install -y build-essential libsdl2-dev cmake git
            cmake -B build -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp
@ -242,6 +262,10 @@ jobs:
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
+            export DEBIAN_FRONTEND=noninteractive
+            sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+            sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+
            apt update
            apt install -y build-essential cmake libsdl2-dev git
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
@ -272,6 +296,14 @@ jobs:
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
+            export DEBIAN_FRONTEND=noninteractive
+            sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+            sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+
+            apt-get update
+            apt-get install -y ca-certificates
+            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
+
            apt update
            apt install -y build-essential cmake libsdl2-dev git
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv8-a
@ -302,6 +334,14 @@ jobs:
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
+            export DEBIAN_FRONTEND=noninteractive
+            sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+            sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+
+            apt-get update
+            apt-get install -y ca-certificates
+            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
+
            apt update
            apt install -y build-essential cmake libsdl2-dev git
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DGGML_NATIVE=OFF -DGGML_CPU_ARM_ARCH=armv7-a+fp
@ -335,6 +375,14 @@ jobs:
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
+            export DEBIAN_FRONTEND=noninteractive
+            sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+            sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+
+            apt-get update
+            apt-get install -y ca-certificates
+            sed -i "s|http://ports.ubuntu.com|https://mirror.kumi.systems|g" /etc/apt/sources.list
+
            apt update
            apt install -y clang build-essential cmake libsdl2-dev git
            cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
@ -365,6 +413,10 @@ jobs:
            -v ${{ github.workspace }}:/workspace \
            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
            set -e
+            export DEBIAN_FRONTEND=noninteractive
+            sed -i "s|archive.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+            sed -i "s|security.ubuntu.com|mirrors.kernel.org|g" /etc/apt/sources.list
+
            apt update
            apt install -y build-essential cmake git
            cmake . -DCMAKE_BUILD_TYPE=Debug \
@ -604,12 +656,19 @@ jobs:
          name: ggml_cpu_${{ matrix.arch }}.dll
          path: build/bin/${{ matrix.build }}/ggml-cpu.dll

+      - name: Pack bin artifacts
+        shell: pwsh
+        run: |
+              Compress-Archive -Path "build/bin/${{ matrix.build }}" -DestinationPath "whisper-bin-${{ matrix.arch }}.zip"
+
      - name: Upload binaries
-        if: matrix.sdl2 == 'ON'
+        if: matrix.sdl2 == 'ON' && ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
+                github.event.inputs.create_release == 'true' ||
+                github.event.inputs.pre_release_tag != '' }}
        uses: actions/upload-artifact@v4
        with:
-          name: whisper-bin-${{ matrix.arch }}
-          path: build/bin/${{ matrix.build }}
+          name: whisper-bin-${{ matrix.arch }}.zip
+          path: whisper-bin-${{ matrix.arch }}.zip

  windows-blas:
    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
@ -622,11 +681,14 @@ jobs:
        arch: [Win32, x64]
        blas: [ON]
        sdl2: [ON]
+        blasver: [0.3.29]
        include:
          - arch: Win32
            s2arc: x86
+            blasfile: x86
          - arch: x64
            s2arc: x64
+            blasfile: x64_64
          - sdl2: ON
            s2ver: 2.28.5

@ -647,7 +709,8 @@ jobs:
      - name: Install OpenBLAS and pkgconfiglite
        if: matrix.blas == 'ON'
        run: |
-          vcpkg install --triplet=${{ matrix.s2arc }}-windows openblas
+          Invoke-WebRequest "https://github.com/OpenMathLib/OpenBLAS/releases/download/v${{matrix.blasver}}/OpenBLAS-${{matrix.blasver}}_${{matrix.blasfile}}.zip" -OutFile "OpenBLAS-${{matrix.blasver}}.zip"
+          Expand-Archive "OpenBLAS-${{matrix.blasver}}.zip" -DestinationPath "OpenBLAS-${{matrix.blasver}}"
          choco install pkgconfiglite

      - name: Fetch SDL2 and set SDL2_DIR
@ -664,6 +727,8 @@ jobs:
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
          -DGGML_BLAS=${{ matrix.blas }}
          -DGGML_BLAS_VENDOR=OpenBLAS
+          -DBLAS_LIBRARIES="$env:GITHUB_WORKSPACE/OpenBLAS-${{matrix.blasver}}/lib/libopenblas.lib"
+          -DBLAS_INCLUDE_DIRS="$env:GITHUB_WORKSPACE/OpenBLAS-${{matrix.blasver}}/include"
          -DWHISPER_SDL2=${{ matrix.sdl2 }}

      - name: Build
@ -673,30 +738,38 @@ jobs:

      - name: Copy openblas.dll
        if: matrix.blas == 'ON'
-        run: copy "C:/vcpkg/packages/openblas_${{ matrix.s2arc }}-windows/bin/openblas.dll" build/bin/${{ matrix.build }}
+        run: copy "$env:GITHUB_WORKSPACE/OpenBLAS-${{matrix.blasver}}/bin/libopenblas.dll" build/bin/${{ matrix.build }}

      - name: Copy SDL2.dll
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

+      - name: Pack bin artifacts
+        shell: pwsh
+        run: |
+              Compress-Archive -Path "build/bin/${{ matrix.build }}" -DestinationPath "whisper-blas-bin-${{ matrix.arch }}.zip"
+
      - name: Upload binaries
-        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
+        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON' && ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
+                github.event.inputs.create_release == 'true' ||
+                github.event.inputs.pre_release_tag != '' }}
        uses: actions/upload-artifact@v4
        with:
-          name: whisper-blas-bin-${{ matrix.arch }}
-          path: build/bin/${{ matrix.build }}
+          name: whisper-blas-bin-${{ matrix.arch }}.zip
+          path: whisper-blas-bin-${{ matrix.arch }}.zip

  windows-cublas:
    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
            github.event.inputs.run_type == 'full-ci' }}
-    runs-on: windows-2019
+    runs-on: windows-2022
    strategy:
+      fail-fast: false
      matrix:
        build: [Release]
        arch: [x64]
        cublas: [ON]
        sdl2: [ON]
-        cuda-toolkit: [12.2.0, 11.8.0]
+        cuda-toolkit: [12.4.0, 11.8.0]
        include:
          - arch: x64
            sdl2: ON
@ -764,7 +837,7 @@ jobs:
          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y

          # Visual Studio integration
-          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\v160\BuildCustomizations" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\v170\BuildCustomizations" /E /I /H /Y

          # Set environment variables
          echo "$CUDA_TOOLKIT_DIR\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
@ -772,23 +845,23 @@ jobs:
          echo "CUDA_PATH=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
          echo "CUDA_PATH_V11_8=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
        
-      - name: Install Cuda Toolkit 12.2.0
-        if: ${{ matrix.cuda-toolkit == '12.2.0' }}
+      - name: Install Cuda Toolkit 12.4.0
+        if: ${{ matrix.cuda-toolkit == '12.4.0' }}
        run: |
          $CUDA_VERSION = ${{ matrix.cuda-toolkit }}
          $CUDA_TOOLKIT_DIR = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$CUDA_VERSION"
          $CUDA_DOWNLOAD = "https://developer.download.nvidia.com/compute/cuda/redist"

          # Components versions
-          $CUDART_VER   = "12.2.140"
-          $NVCC_VER     = "12.2.140"
-          $NVRTC_VER    = "12.2.140"
-          $CUBLAS_VER   = "12.2.5.6"
-          $NVTX_VER     = "12.2.140"
-          $PROFILER_VER = "12.2.140"
-          $VS_VER       = "12.2.140"
-          $NVPROF_VER   = "12.2.142"
-          $CCCL_VER     = "12.2.140"
+          $CUDART_VER   = "12.4.127"
+          $NVCC_VER     = "12.4.131"
+          $NVRTC_VER    = "12.4.127"
+          $CUBLAS_VER   = "12.4.5.8"
+          $NVTX_VER     = "12.4.127"
+          $PROFILER_VER = "12.4.127"
+          $VS_VER       = "12.4.127"
+          $NVPROF_VER   = "12.4.128"
+          $CCCL_VER     = "12.4.127"

          # Create the directory where the CUDA Toolkit will be installed
          mkdir -p $CUDA_TOOLKIT_DIR
@ -822,7 +895,7 @@ jobs:
          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y

          # Visual Studio integration
-          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\v160\BuildCustomizations" /E /I /H /Y
+          xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Microsoft\VC\v170\BuildCustomizations" /E /I /H /Y

          # Set environment variables
          echo "$CUDA_TOOLKIT_DIR\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
@ -850,14 +923,21 @@ jobs:
      - name: Build Project
        shell: cmd
        run: |
-          call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+          call "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
          cmake --version
          where cmake
+          if "${{ matrix.cuda-toolkit }}" == "11.8.0" (
+            set CUDA_FLAGS=-allow-unsupported-compiler -D_ALLOW_COMPILER_AND_STL_VERSION_MISMATCH -D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR
+          ) else (
+            set CUDA_FLAGS=
+          )
          cmake -S . -B build -G "Ninja Multi-Config" ^
            -DCMAKE_BUILD_TYPE=${{ matrix.build }} ^
            -DGGML_CUDA=${{ matrix.cublas }} ^
            -DWHISPER_SDL2=${{ matrix.sdl2 }} ^
-            -DSDL2_DIR="%SDL2_DIR%"
+            -DSDL2_DIR="%SDL2_DIR%" ^
+            -DCMAKE_POLICY_VERSION_MINIMUM=3.5 ^
+            -DCMAKE_CUDA_FLAGS="%CUDA_FLAGS%"
          set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
          cmake --build build --config ${{ matrix.build }} -j %NUMBER_OF_PROCESSORS%

@ -874,11 +954,19 @@ jobs:
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.arch }}/SDL2.dll" build/bin/${{ matrix.build }}

+      - name: Pack bin artifacts
+        shell: pwsh
+        run: |
+              Compress-Archive -Path "build/bin/${{ matrix.build }}" -DestinationPath "whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}.zip"
+
      - name: Upload binaries
+        if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
+                github.event.inputs.create_release == 'true' ||
+                github.event.inputs.pre_release_tag != '' }}
        uses: actions/upload-artifact@v4
        with:
-          name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
-          path: build/bin/${{ matrix.build }}
+          name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}.zip
+          path: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}.zip

  emscripten:
    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
@ -964,7 +1052,7 @@ jobs:
        uses: actions/upload-artifact@v4
        with:
          path: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip
-          name: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework
+          name: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip

  android:
    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
@ -1098,11 +1186,16 @@ jobs:
          chmod +x ./gradlew
          ./gradlew build --info

+      - name: Pack jar artifacts
+        shell: pwsh
+        run: |
+              Compress-Archive -Path "bindings/java/build/libs/whispercpp-*.jar" -DestinationPath "whispercpp.jar.zip"
+
      - name: Upload jar
        uses: actions/upload-artifact@v4
        with:
-          name: whispercpp.jar
-          path: bindings/java/build/libs/whispercpp-*.jar
+          name: whispercpp.jar.zip
+          path: whispercpp.jar.zip

 #      - name: Publish package
 #        if: ${{ github.ref == 'refs/heads/master' }}
@ -1140,6 +1233,9 @@ jobs:
    needs:
      - determine-tag
      - ios-xcode-build
+      - windows
+      - windows-blas
+      - windows-cublas

    steps:
      - name: Clone
@ -1223,3 +1319,23 @@ jobs:
          source venv/bin/activate
          pip install ane_transformers openai-whisper coremltools
          ./models/generate-coreml-model.sh ${{ env.MODEL_NAME }}
+
+  vad:
+    if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
+            github.event.inputs.run_type == 'full-ci' }}
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Build
+        shell: bash
+        run: |
+          cmake -B build
+          cmake --build build --config Release
+
+      - name: Test
+        shell: bash
+        run: |
+          ctest -R ^test-vad$ --test-dir build --output-on-failure -VV
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@ -19,9 +19,8 @@ jobs:
        config:
          - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64" }
          - { tag: "main-musa", dockerfile: ".devops/main-musa.Dockerfile", platform: "linux/amd64" }
-          #TODO: the cuda image keeps failing - disable for now
-          #      https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
-          #- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
+          - { tag: "main-intel", dockerfile: ".devops/main-intel.Dockerfile", platform: "linux/amd64" }
+          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }

    steps:
      - name: Check out the repo
--- a/.gitignore
+++ b/.gitignore
@ -14,6 +14,7 @@

 build/
 build-*/
+build_*/

 # SPM
 .build/
@ -49,6 +50,8 @@ extra/bench-gg.txt
 models/*.mlmodel
 models/*.mlmodelc
 models/*.mlpackage
+models/*-encoder-openvino.xml
+models/*-encoder-openvino-cache/
 bindings/java/.gradle/
 bindings/java/.idea/
 .idea/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -59,9 +59,6 @@ option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
 # option list
 #

-# general
-option(WHISPER_CCACHE "whisper: use ccache if available" ON)
-
 # debug
 option(WHISPER_ALL_WARNINGS           "whisper: enable all compiler warnings"                   ON)
 option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
@ -96,7 +93,6 @@ option(WHISPER_OPENVINO              "whisper: support for OpenVINO"      OFF)
 include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)

 # override ggml options
-set(GGML_CCACHE             ${WHISPER_CCACHE})
 set(GGML_SANITIZE_THREAD    ${WHISPER_SANITIZE_THREAD})
 set(GGML_SANITIZE_ADDRESS   ${WHISPER_SANITIZE_ADDRESS})
 set(GGML_SANITIZE_UNDEFINED ${WHISPER_SANITIZE_UNDEFINED})
@ -121,6 +117,12 @@ whisper_option_depr(WARNING     WHISPER_OPENMP              GGML_OPENMP)
 whisper_option_depr(WARNING     WHISPER_RPC                 GGML_RPC)
 whisper_option_depr(WARNING     WHISPER_SYCL                GGML_SYCL)
 whisper_option_depr(WARNING     WHISPER_SYCL_F16            GGML_SYCL_F16)
+whisper_option_depr(WARNING     WHISPER_CCACHE              GGML_CCACHE)
+
+if (GGML_CUDA AND NOT MSVC)
+    #GGML_CUDA enabled, add the necessary compile options -Wno-deprecated-gpu-targets
+    add_compile_options(-Wno-deprecated-gpu-targets)
+endif()

 #
 # build the library
@ -226,10 +228,13 @@ if (MSVC)
        /wd4996  # Function or variable may be unsafe/deprecated
    )
    function(disable_msvc_warnings target_name)
-        target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
+        if(TARGET ${target_name})
+            target_compile_options(${target_name} PRIVATE ${MSVC_WARNING_FLAGS})
+        endif()
    endfunction()

    if (WHISPER_BUILD_EXAMPLES)
+        disable_msvc_warnings(whisper)
        disable_msvc_warnings(common)
        disable_msvc_warnings(common-sdl)
        disable_msvc_warnings(lsp)
@ -241,5 +246,6 @@ if (MSVC)
        disable_msvc_warnings(whisper-talk-llama)
        disable_msvc_warnings(whisper-bench)
        disable_msvc_warnings(quantize)
+        disable_msvc_warnings(vad-speech-segments)
    endif()
 endif()
--- a/4
+++ b/4
@ -48,10 +48,10 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 larg
 	@echo "Running $@ on all samples in ./samples ..."
 	@echo "==============================================="
 	@echo ""
-	@for f in samples/*$(.flac .mp3 .ogg .wav); do \
+	@for f in samples/*.{flac,mp3,ogg,wav}; do \
 		echo "----------------------------------------------" ; \
 		echo "[+] Running $@ on $$f ... (run 'ffplay $$f' to listen)" ; \
-	    echo "----------------------------------------------" ; \
+		echo "----------------------------------------------" ; \
 		echo "" ; \
 		./build/bin/whisper-cli -m models/ggml-$@.bin -f $$f ; \
 		echo "" ; \
--- a/README.md
+++ b/README.md
@ -25,6 +25,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - [Ascend NPU Support](#ascend-npu-support)
 - [Moore Threads GPU Support](#moore-threads-gpu-support)
 - [C-style API](https://github.com/ggml-org/whisper.cpp/blob/master/include/whisper.h)
+- [Voice Activity Detection (VAD)](#voice-activity-detection-vad)

 Supported platforms:

@ -34,7 +35,7 @@ Supported platforms:
 - [x] [Java](bindings/java/README.md)
 - [x] Linux / [FreeBSD](https://github.com/ggml-org/whisper.cpp/issues/56#issuecomment-1350920264)
 - [x] [WebAssembly](examples/whisper.wasm)
- [x] Windows ([MSVC](https://github.com/ggml-org/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggml-org/whisper.cpp/issues/168)]
+- [x] Windows ([MSVC](https://github.com/ggml-org/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggml-org/whisper.cpp/issues/168))
 - [x] [Raspberry Pi](https://github.com/ggml-org/whisper.cpp/discussions/166)
 - [x] [Docker](https://github.com/ggml-org/whisper.cpp/pkgs/container/whisper.cpp)

@ -266,7 +267,7 @@ This can result in significant speedup in encoder performance. Here are the inst

 - Build `whisper.cpp` with OpenVINO support:

-  Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
+  Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2024.6.0](https://github.com/openvinotoolkit/openvino/releases/tag/2024.6.0). Ready to use Binaries of the required libraries can be found in the [OpenVino Archives](https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.6/)

  After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:

@ -385,7 +386,7 @@ Run the inference examples as usual, for example:
 ## Moore Threads GPU support

 With Moore Threads cards the processing of the models is done efficiently on the GPU via muBLAS and custom MUSA kernels.
-First, make sure you have installed `MUSA SDK rc3.1.1`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=rc3.1.1
+First, make sure you have installed `MUSA SDK rc4.0.1`: https://developer.mthreads.com/sdk/download/musa?equipment=&os=&driverVersion=&version=4.0.1

 Now build `whisper.cpp` with MUSA support:

@ -599,7 +600,7 @@ main: processing './samples/a13.wav' (480000 samples, 30.0 sec), 4 threads, 1 pr
 ## Karaoke-style movie generation (experimental)

 The [whisper-cli](examples/cli) example provides support for output of karaoke-style movies, where the
-currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
+currently pronounced word is highlighted. Use the `-owts` argument and run the generated bash script.
 This requires to have `ffmpeg` installed.

 Here are a few _"typical"_ examples:
@ -732,6 +733,89 @@ let package = Package(
 )
 ```

+## Voice Activity Detection (VAD)
+Support for Voice Activity Detection (VAD) can be enabled using the `--vad`
+argument to `whisper-cli`. In addition to this option a VAD model is also
+required.
+
+The way this works is that first the audio samples are passed through
+the VAD model which will detect speech segments. Using this information the
+only the speech segments that are detected are extracted from the original audio
+input and passed to whisper for processing. This reduces the amount of audio
+data that needs to be processed by whisper and can significantly speed up the
+transcription process.
+
+The following VAD models are currently supported:
+
+### Silero-VAD
+[Silero-vad](https://github.com/snakers4/silero-vad) is a lightweight VAD model
+written in Python that is fast and accurate.
+
+Models can be downloaded by running the following command on Linux or MacOS:
+```console
+$ ./models/download-vad-model.sh silero-v5.1.2
+Downloading ggml model silero-v5.1.2 from 'https://huggingface.co/ggml-org/whisper-vad' ...
+ggml-silero-v5.1.2.bin        100%[==============================================>] 864.35K  --.-KB/s    in 0.04s
+Done! Model 'silero-v5.1.2' saved in '/path/models/ggml-silero-v5.1.2.bin'
+You can now use it like this:
+
+  $ ./build/bin/whisper-cli -vm /path/models/ggml-silero-v5.1.2.bin --vad -f samples/jfk.wav -m models/ggml-base.en.bin
+
+```
+And the following command on Windows:
+```console
+> .\models\download-vad-model.cmd silero-v5.1.2
+Downloading vad model silero-v5.1.2...
+Done! Model silero-v5.1.2 saved in C:\Users\danie\work\ai\whisper.cpp\ggml-silero-v5.1.2.bin
+You can now use it like this:
+
+C:\path\build\bin\Release\whisper-cli.exe -vm C:\path\ggml-silero-v5.1.2.bin --vad -m models/ggml-base.en.bin -f samples\jfk.wav
+
+```
+
+To see a list of all available models, run the above commands without any
+arguments.
+
+This model can be also be converted manually to ggml using the following command:
+```console
+$ python3 -m venv venv && source venv/bin/activate
+$ (venv) pip install silero-vad
+$ (venv) $ python models/convert-silero-vad-to-ggml.py --output models/silero.bin
+Saving GGML Silero-VAD model to models/silero-v5.1.2-ggml.bin
+```
+And it can then be used with whisper as follows:
+```console
+$ ./build/bin/whisper-cli \
+   --file ./samples/jfk.wav \
+   --model ./models/ggml-base.en.bin \
+   --vad \
+   --vad-model ./models/silero-v5.1.2-ggml.bin
+```
+
+### VAD Options
+
+* --vad-threshold: Threshold probability for speech detection. A probability
+for a speech segment/frame above this threshold will be considered as speech.
+
+* --vad-min-speech-duration-ms: Minimum speech duration in milliseconds. Speech
+segments shorter than this value will be discarded to filter out brief noise or
+false positives.
+
+* --vad-min-silence-duration-ms: Minimum silence duration in milliseconds. Silence
+periods must be at least this long to end a speech segment. Shorter silence
+periods will be ignored and included as part of the speech.
+
+* --vad-max-speech-duration-s: Maximum speech duration in seconds. Speech segments
+longer than this will be automatically split into multiple segments at silence
+points exceeding 98ms to prevent excessively long segments.
+
+* --vad-speech-pad-ms: Speech padding in milliseconds. Adds this amount of padding
+before and after each detected speech segment to avoid cutting off speech edges.
+
+* --vad-samples-overlap: Amount of audio to extend from each speech segment into
+the next one, in seconds (e.g., 0.10 = 100ms overlap). This ensures speech isn't
+cut off abruptly between segments when they're concatenated together.
+
 ## Examples

 There are various examples of using the library for different projects in the [examples](examples) folder.
--- a/README_sycl.md
+++ b/README_sycl.md
@ -16,13 +16,13 @@

 ## Background

-SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators<EFBFBD>such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.
+SYCL is a higher-level programming model to improve programming productivity on various hardware accelerators—such as CPUs, GPUs, and FPGAs. It is a single-source embedded domain-specific language based on pure C++17.

 oneAPI is a specification that is open and standards-based, supporting multiple architecture types including but not limited to GPU, CPU, and FPGA. The spec has both direct programming and API-based programming paradigms.

 Intel uses the SYCL as direct programming language to support CPU, GPUs and FPGAs.

-To avoid  re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel<EFBFBD> DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.
+To avoid  re-inventing the wheel, this code refers other code paths in llama.cpp (like OpenBLAS, cuBLAS, CLBlast). We use a open-source tool [SYCLomatic](https://github.com/oneapi-src/SYCLomatic) (Commercial release [Intel® DPC++ Compatibility Tool](https://www.intel.com/content/www/us/en/developer/tools/oneapi/dpc-compatibility-tool.html)) migrate to SYCL.

 The whisper.cpp for SYCL is used to support Intel GPUs.

@ -84,10 +84,10 @@ Platform #0: Intel(R) OpenCL HD Graphics
 `-- Device #0: Intel(R) Iris(R) Xe Graphics [0x9a49]
 ```

-2. Install Intel<EFBFBD> oneAPI Base toolkit.
+2. Install Intel® oneAPI Base toolkit.


-a. Please follow the procedure in [Get the Intel<EFBFBD> oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).
+a. Please follow the procedure in [Get the Intel® oneAPI Base Toolkit ](https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html).

 Recommend to install to default folder: **/opt/intel/oneapi**.

--- a/bindings/java/README.md
+++ b/bindings/java/README.md
@ -23,26 +23,42 @@ import io.github.ggerganov.whispercpp.WhisperCpp;
 public class Example {

    public static void main(String[] args) {
+        
        WhisperCpp whisper = new WhisperCpp();
-        // By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
-        // or you can provide the absolute path to the model file.
-        long context = whisper.initContext("base.en");
        try {
-            var whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
+            // By default, models are loaded from ~/.cache/whisper/ and are usually named "ggml-${name}.bin"
+            // or you can provide the absolute path to the model file.
+            whisper.initContext("../ggml-base.en.bin"); 
+            WhisperFullParams.ByValue whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH); 
+            
            // custom configuration if required      
-            whisperParams.temperature_inc = 0f;
+            //whisperParams.n_threads = 8;
+            whisperParams.temperature = 0.0f;
+            whisperParams.temperature_inc = 0.2f;
+            //whisperParams.language = "en";
                            
-            var samples = readAudio(); // divide each value by 32767.0f
-            whisper.fullTranscribe(whisperParams, samples);
+            float[] samples = readAudio(); // divide each value by 32767.0f
+            List<WhisperSegment> whisperSegmentList = whisper.fullTranscribeWithTime(whisperParams, samples);
+            
+            for (WhisperSegment whisperSegment : whisperSegmentList) {
+
+                long start = whisperSegment.getStart();
+                long end = whisperSegment.getEnd();
+
+                String text = whisperSegment.getSentence();
+                    
+                System.out.println("start: "+start);
+                System.out.println("end: "+end);
+                System.out.println("text: "+text);
                
-            int segmentCount = whisper.getTextSegmentCount(context);
-            for (int i = 0; i < segmentCount; i++) {
-                String text = whisper.getTextSegment(context, i);
-                System.out.println(segment.getText());
            }
+    
+        } catch (IOException e) {
+            e.printStackTrace();
        } finally {
-             whisper.freeContext(context);
+            whisper.close();
        }
+        
     }
 }
 ```
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCpp.java
@ -168,23 +168,26 @@ public class WhisperCpp implements AutoCloseable {
        return str.toString().trim();
    }

-    public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
+    /**
+     * Full transcribe with time list.
+     *
+     * @param whisperParams the whisper params
+     * @param audioData     the audio data
+     * @return the list
+     * @throws IOException the io exception
+     */
+    public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
        if (ctx == null) {
            throw new IllegalStateException("Model not initialised");
        }

-        WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
-            lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
-        valueParams.read();
-
-        if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
+        if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
            throw new IOException("Failed to process audio");
        }

        int nSegments = lib.whisper_full_n_segments(ctx);
        List<WhisperSegment> segments= new ArrayList<>(nSegments);

-
        for (int i = 0; i < nSegments; i++) {
            long t0 = lib.whisper_full_get_segment_t0(ctx, i);
            String text = lib.whisper_full_get_segment_text(ctx, i);
--- a/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java
+++ b/bindings/java/src/test/java/io/github/ggerganov/whispercpp/WhisperCppTest.java
@ -118,7 +118,7 @@ class WhisperCppTest {
        float[] floats = new float[b.length / 2];

        //WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
-        WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
+        WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
        params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
        params.print_progress = CBool.FALSE;
        //params.initial_prompt = "and so my fellow Americans um, like";
--- a/bindings/ruby/.gitignore
+++ b/bindings/ruby/.gitignore
@ -1,6 +1,9 @@
 LICENSE
 pkg/
 lib/whisper.*
-ext/sources/*
-!ext/sources/CMakeGraphVizOptions.cmake
-ext/mkmf.log
+ext/examples/
+ext/ggml/
+ext/include/
+ext/scripts/
+ext/src/
+test/fixtures/
--- a/bindings/ruby/README.md
+++ b/bindings/ruby/README.md
@ -24,7 +24,21 @@ or,

    $ gem install whispercpp -- --enable-ggml-cuda

-See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need convert options present the README to Ruby-style options.  
+See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need convert options present the README to Ruby-style options, for example:
+
+Boolean options:
+
+* `-DGGML_BLAS=1` -> `--enable-ggml-blas`
+* `-DWHISER_COREML=OFF` -> `--disable-whisper-coreml`
+
+Argument options:
+
+* `-DGGML_CUDA_COMPRESSION_MODE=size` -> `--ggml-cuda-compression-mode=size`
+
+Combination:
+
+* `-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"` -> `--enable-ggml-cuda --cmake_cuda-architectures="86"`
+
 For boolean options like `GGML_CUDA`, the README says `-DGGML_CUDA=1`. You need strip `-D`, prepend `--enable-` for `1` or `ON` (`--disable-` for `0` or `OFF`) and make it kebab-case: `--enable-ggml-cuda`.  
 For options which require arguments like `CMAKE_CUDA_ARCHITECTURES`, the README says `-DCMAKE_CUDA_ARCHITECTURES="86"`. You need strip `-D`, prepend `--`, make it kebab-case, append `=` and append argument: `--cmake-cuda-architectures="86"`.

@ -56,17 +70,6 @@ end

 Some models are prepared up-front:

-```ruby
-base_en = Whisper::Model.pre_converted_models["base.en"]
-whisper = Whisper::Context.new(base_en)
-```
-
-At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
-
-```ruby
-Whisper::Model.pre_converted_models["base"].clear_cache
-```
-
 You also can use shorthand for pre-converted models:

 ```ruby
@ -91,6 +94,19 @@ puts Whisper::Model.pre_converted_models.keys
 #   :
 ```

+You can also retrieve each model:
+
+```ruby
+base_en = Whisper::Model.pre_converted_models["base.en"]
+whisper = Whisper::Context.new(base_en)
+```
+
+At first time you use a model, it is downloaded automatically. After that, downloaded cached file is used. To clear cache, call `#clear_cache`:
+
+```ruby
+Whisper::Model.pre_converted_models["base"].clear_cache
+```
+
 You can also use local model files you prepared:

 ```ruby
@ -111,9 +127,80 @@ See [models][] page for details.

 Currently, whisper.cpp accepts only 16-bit WAV files.

+### Voice Activity Detection (VAD) ###
+
+Support for Voice Activity Detection (VAD) can be enabled by setting `Whisper::Params`'s `vad` argument to `true` and specifying VAD model:
+
+```ruby
+Whisper::Params.new(
+  vad: true,
+  vad_model_path: "silero-v5.1.2",
+  # other arguments...
+)
+```
+
+When you pass the model name (`"silero-v5.1.2"`) or URI (`https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin`), it will be downloaded automatically.
+Currently, "silero-v5.1.2" is registered as pre-converted model like ASR models. You also specify file path or URI of model.
+
+If you need configure VAD behavior, pass params for that:
+
+```ruby
+Whisper::Params.new(
+  vad: true,
+  vad_model_path: "silero-v5.1.2",
+  vad_params: Whisper::VAD::Params.new(
+    threshold: 1.0, # defaults to 0.5
+    min_speech_duration_ms: 500, # defaults to 250
+    min_silence_duration_ms: 200, # defaults to 100
+    max_speech_duration_s: 30000, # default is FLT_MAX,
+    speech_pad_ms: 50, # defaults to 30
+    samples_overlap: 0.5 # defaults to 0.1
+  ),
+  # other arguments...
+)
+```
+
+For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
+
+### Output ###
+
+whispercpp supports SRT and WebVTT output:
+
+```ruby
+puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
+# =>
+WEBVTT
+
+1
+00:00:00.000 --> 00:00:03.860
+ My thought I have nobody by a beauty and will as you poured.
+
+2
+00:00:03.860 --> 00:00:09.840
+ Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
+
+3
+00:00:09.840 --> 00:00:09.940
+ a
+
+```
+
+You may call `#to_srt`, too
+
+
 API
 ---

+### Transcription ###
+
+By default, `Whisper::Context#transcribe` works in a single thread. You can make it work in parallel by passing `n_processors` option:
+
+```ruby
+whisper.transcribe("path/to/audio.wav", params, n_processors: Etc.nprocessors)
+```
+
+Note that transcription occasionally might be low accuracy when it works in parallel.
+
 ### Segments ###

 Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
@ -135,7 +222,7 @@ whisper
      ed: format_time(segment.end_time),
      text: segment.text
    }
-    line << " (speaker turned)" if segment.speaker_next_turn?
+    line << " (speaker turned)" if segment.speaker_turn_next?
    puts line
  end

@ -151,7 +238,7 @@ params.on_new_segment do |segment|
    ed: format_time(segment.end_time),
    text: segment.text
  }
-  line << " (speaker turned)" if segment.speaker_next_turn?
+  line << " (speaker turned)" if segment.speaker_turn_next?
  puts line
 end

@ -248,6 +335,11 @@ First call of `rake test` builds an extension and downloads a model for testing.

 If something seems wrong on build, running `rake clean` solves some cases.

+### Need help ###
+
+* Windows support
+* Refinement of C/C++ code, especially memory management
+
 License
 -------

--- a/bindings/ruby/Rakefile
+++ b/bindings/ruby/Rakefile
@ -67,17 +67,30 @@ file LIB_FILE => [SO_FILE, "lib"] do |t|
 end
 CLEAN.include LIB_FILE

-Rake::TestTask.new do |t|
-  t.test_files = FileList["tests/test_*.rb"]
+Rake::TestTask.new
+
+TEST_FIXTURE_AUDIO = "test/fixtures/jfk.wav"
+TEST_FIXTURE_AUDIO_SRC = File.expand_path(File.join(__dir__, "..", "..", "samples", "jfk.wav"))
+TEST_FIXTURE_AUDIO_DIR = TEST_FIXTURE_AUDIO.pathmap("%d")
+directory TEST_FIXTURE_AUDIO_DIR
+if File.exist? TEST_FIXTURE_AUDIO_SRC
+  file TEST_FIXTURE_AUDIO => [TEST_FIXTURE_AUDIO_SRC, TEST_FIXTURE_AUDIO_DIR] do |t|
+    symlink t.source, t.name
+  end
+else
+  require "open-uri"
+  file TEST_FIXTURE_AUDIO => TEST_FIXTURE_AUDIO_DIR do |t|
+    File.write t.name, URI("https://github.com/ggml-org/whisper.cpp/raw/refs/heads/master/samples/jfk.wav").read
+  end
 end

-TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
-file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
-  chdir "tests/jfk_reader" do
+TEST_MEMORY_VIEW = "test/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
+file TEST_MEMORY_VIEW => "test/jfk_reader/jfk_reader.c" do |t|
+  chdir "test/jfk_reader" do
    ruby "extconf.rb"
    sh "make"
  end
 end
-CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
+CLEAN.include TEST_MEMORY_VIEW

-task test: [LIB_FILE, TEST_MEMORY_VIEW]
+task test: [LIB_FILE, TEST_MEMORY_VIEW, TEST_FIXTURE_AUDIO]
--- a/bindings/ruby/ext/.gitignore
+++ b/bindings/ruby/ext/.gitignore
@ -2,10 +2,8 @@ Makefile
 whisper.so
 whisper.bundle
 whisper.dll
-scripts/get-flags.mk
 *.o
-/*/**/*.c
-/*/**/*.cpp
-/*/**/*.h
-/*/**/*.m
-/*/**/*.metal
+*.a
+sources/*
+!sources/CMakeGraphVizOptions.cmake
+mkmf.log
--- a/bindings/ruby/ext/dependencies.rb
+++ b/bindings/ruby/ext/dependencies.rb
@ -1,16 +1,32 @@
 require "tsort"

 class Dependencies
+  include TSort
+
  def initialize(cmake, options)
    @cmake = cmake
    @options = options
+    @static_lib_shape = nil
+    @nodes = {}
+    @graph = Hash.new {|h, k| h[k] = []}

    generate_dot
-    @libs = parse_dot
+    parse_dot
+  end
+
+  def libs
+    tsort.filter_map {|node|
+      label, shape = @nodes[node]
+      if shape == @static_lib_shape
+        label.gsub(/\\n\([^)]+\)/, '')
+      else
+        nil
+      end
+    }.reverse.collect {|lib| "lib#{lib}.a"}
  end

  def to_s
-    @libs.join(" ")
+    libs.join(" ")
  end

  private
@ -20,42 +36,38 @@ class Dependencies
  end

  def generate_dot
-    system @cmake, "-S", "sources", "-B", "build", "--graphviz", dot_path, "-D", "BUILD_SHARED_LIBS=OFF", @options.to_s, exception: true
+    args = ["-S", "sources", "-B", "build", "--graphviz", dot_path, "-D", "BUILD_SHARED_LIBS=OFF"]
+    args << @options.to_s unless @options.to_s.empty?
+    system @cmake, *args, exception: true
  end

  def parse_dot
-    static_lib_shape = nil
-    nodes = {}
-    depends = Hash.new {|h, k| h[k] = []}
-
-    class << depends
-      include TSort
-      alias tsort_each_node each_key
-      def tsort_each_child(node, &block)
-        fetch(node, []).each(&block)
-      end
-    end
-
    File.open(dot_path).each_line do |line|
      case line
      when /\[\s*label\s*=\s*"Static Library"\s*,\s*shape\s*=\s*(?<shape>\w+)\s*\]/
-        static_lib_shape = $~[:shape]
+        @static_lib_shape = $~[:shape]
      when /\A\s*"(?<node>\w+)"\s*\[\s*label\s*=\s*"(?<label>\S+)"\s*,\s*shape\s*=\s*(?<shape>\w+)\s*\]\s*;\s*\z/
        node = $~[:node]
        label = $~[:label]
        shape = $~[:shape]
-        nodes[node] = [label, shape]
+        @nodes[node] = [label, shape]
      when /\A\s*"(?<depender>\w+)"\s*->\s*"(?<dependee>\w+)"/
        depender = $~[:depender]
        dependee = $~[:dependee]
-        depends[depender] ||= []
-        depends[depender] << dependee
+        @graph[depender] << dependee
      end
    end
-    depends.tsort.filter_map {|node|
-      label, shape = nodes[node]
-      shape == static_lib_shape ? label : nil
-    }.collect {|lib| "lib#{lib}.a"}
-      .reverse
+  end
+
+  def tsort_each_node
+    @nodes.each_key do |node|
+      yield node
+    end
+  end
+
+  def tsort_each_child(node)
+    @graph[node].each do |child|
+      yield child
+    end
  end
 end
--- a/bindings/ruby/ext/extconf.rb
+++ b/bindings/ruby/ext/extconf.rb
@ -3,7 +3,7 @@ require_relative "options"
 require_relative "dependencies"

 cmake = find_executable("cmake") || abort
-options = Options.new
+options = Options.new(cmake)
 have_library("gomp") rescue nil
 libs = Dependencies.new(cmake, options)

--- a/bindings/ruby/ext/options.rb
+++ b/bindings/ruby/ext/options.rb
@ -1,25 +1,11 @@
 class Options
-  def initialize
+  def initialize(cmake="cmake")
+    @cmake = cmake
    @options = {}
-    @pending_options = []
-    @ignored_options = []

    configure
  end

-  def help
-    @options
-      .collect_concat {|name, (type, value)|
-        option = option_name(name)
-        if type == :bool
-          ["--enable-#{option}", "--disable-#{option}"]
-        else
-          "--#{option}=#{type.upcase}"
-        end
-      }
-      .join($/)
-  end
-
  def to_s
    @options
      .reject {|name, (type, value)| value.nil?}
@ -32,188 +18,65 @@ class Options

    output = nil
    Dir.chdir __dir__ do
-      output = `cmake -S sources -B build -L`
+      output = `#{@cmake.shellescape} -S sources -B build -L`
    end
-    started = false
-    @cmake_options = output.lines.filter_map {|line|
-      if line.chomp == "-- Cache values"
-        started = true
-        next
-      end
-      next unless started
-      option, value = line.chomp.split("=", 2)
-      name, type = option.split(":", 2)
-      [name, type, value]
-    }
-  end
-
-  def missing_options
-    cmake_options.collect {|name, type, value| name} -
-      @options.keys - @pending_options - @ignored_options
-  end
-
-  def extra_options
-    @options.keys + @pending_options - @ignored_options -
-      cmake_options.collect {|name, type, value| name}
+    @cmake_options = output.lines.drop_while {|line| line.chomp != "-- Cache values"}.drop(1)
+                       .filter_map {|line|
+                         option, value = line.chomp.split("=", 2)
+                         name, type = option.split(":", 2)
+                         [
+                           name,
+                           [
+                             type,
+                             type == "BOOL" ? value == "ON" : value
+                           ]
+                         ]
+                       }.to_h
  end

  private

  def configure
-    filepath "ACCELERATE_FRAMEWORK"
-    ignored "BUILD_SHARED_LIBS"
-    ignored "BUILD_TESTING"
-    ignored "CMAKE_BUILD_TYPE"
-    ignored "CMAKE_INSTALL_PREFIX"
-    string "CMAKE_OSX_ARCHITECTURES"
-    ignored "CMAKE_OSX_DEPLOYMENT_TARGET"
-    string "CMAKE_OSX_SYSROOT"
-    filepath "FOUNDATION_LIBRARY"
-    bool "GGML_ACCELERATE"
-    bool "GGML_ALL_WARNINGS_3RD_PARTY"
-    bool "GGML_AMX_BF16"
-    bool "GGML_AMX_INT8"
-    bool "GGML_AMX_TILE"
-    bool "GGML_AVX"
-    bool "GGML_AVX2"
-    bool "GGML_AVX512"
-    bool "GGML_AVX512_BF16"
-    bool "GGML_AVX512_VBMI"
-    bool "GGML_AVX512_VNNI"
-    bool "GGML_AVX_VNNI"
-    ignored "GGML_BACKEND_DL"
-    ignored "GGML_BIN_INSTALL_DIR"
-    bool "GGML_BLAS"
-    string "GGML_BLAS_VENDOR"
-    bool "GGML_BMI2"
-    ignored "GGML_BUILD_EXAMPLES"
-    ignored "GGML_BUILD_TESTS"
-    filepath "GGML_CCACHE_FOUND"
-    bool "GGML_CPU"
-    bool "GGML_CPU_AARCH64"
-    ignored "GGML_CPU_ALL_VARIANTS"
-    string "GGML_CPU_ARM_ARCH"
-    bool "GGML_CPU_HBM"
-    bool "GGML_CPU_KLEIDIAI"
-    string "GGML_CPU_POWERPC_CPUTYPE"
-    bool "GGML_CUDA"
-    string "GGML_CUDA_COMPRESSION_MODE"
-    bool "GGML_CUDA_F16"
-    bool "GGML_CUDA_FA"
-    bool "GGML_CUDA_FA_ALL_QUANTS"
-    bool "GGML_CUDA_FORCE_CUBLAS"
-    bool "GGML_CUDA_FORCE_MMQ"
-    ignored "GGML_CUDA_GRAPHS"
-    bool "GGML_CUDA_NO_PEER_COPY"
-    bool "GGML_CUDA_NO_VMM"
-    string "GGML_CUDA_PEER_MAX_BATCH_SIZE"
-    bool "GGML_F16C"
-    bool "GGML_FMA"
-    bool "GGML_GPROF"
-    bool "GGML_HIP"
-    bool "GGML_HIP_GRAPHS"
-    bool "GGML_HIP_NO_VMM"
-    bool "GGML_HIP_ROCWMMA_FATTN"
-    ignored "GGML_INCLUDE_INSTALL_DIR"
-    bool "GGML_KOMPUTE"
-    bool "GGML_LASX"
-    ignored "GGML_LIB_INSTALL_DIR"
-    ignored "GGML_LLAMAFILE"
-    bool "GGML_LSX"
-    bool "GGML_LTO"
-    bool "GGML_METAL"
-    bool "GGML_METAL_EMBED_LIBRARY"
-    string "GGML_METAL_MACOSX_VERSION_MIN"
-    bool "GGML_METAL_NDEBUG"
-    bool "GGML_METAL_SHADER_DEBUG"
-    string "GGML_METAL_STD"
-    bool "GGML_METAL_USE_BF16"
-    bool "GGML_MUSA"
-    bool "GGML_NATIVE"
-    bool "GGML_OPENCL"
-    bool "GGML_OPENCL_EMBED_KERNELS"
-    bool "GGML_OPENCL_PROFILING"
-    string "GGML_OPENCL_TARGET_VERSION"
-    bool "GGML_OPENCL_USE_ADRENO_KERNELS"
-    bool "GGML_OPENMP"
-    bool "GGML_RPC"
-    bool "GGML_RVV"
-    bool "GGML_RV_ZFH"
-    pending "GGML_SCCACHE_FOUND"
-    string "GGML_SCHED_MAX_COPIES"
-    bool "GGML_SSE42"
-    ignored "GGML_STATIC"
-    bool "GGML_SYCL"
-    string "GGML_SYCL_DEVICE_ARCH"
-    bool "GGML_SYCL_F16"
-    bool "GGML_SYCL_GRAPH"
-    string "GGML_SYCL_TARGET"
-    bool "GGML_VULKAN"
-    bool "GGML_VULKAN_CHECK_RESULTS"
-    bool "GGML_VULKAN_DEBUG"
-    bool "GGML_VULKAN_MEMORY_DEBUG"
-    bool "GGML_VULKAN_PERF"
-    ignored "GGML_VULKAN_RUN_TESTS"
-    filepath "GGML_VULKAN_SHADERS_GEN_TOOLCHAIN"
-    bool "GGML_VULKAN_SHADER_DEBUG_INFO"
-    pending "GGML_VULKAN_VALIDATE"
-    bool "GGML_VXE"
-    filepath "GIT_EXE"
-    filepath "MATH_LIBRARY"
-    filepath "METALKIT_FRAMEWORK"
-    filepath "METAL_FRAMEWORK"
-    bool "WHISPER_ALL_WARNINGS"
-    bool "WHISPER_ALL_WARNINGS_3RD_PARTY"
-    ignored "WHISPER_BIN_INSTALL_DIR"
-    ignored "WHISPER_BUILD_EXAMPLES"
-    ignored "WHISPER_BUILD_SERVER"
-    ignored"WHISPER_BUILD_TESTS"
-    bool "WHISPER_CCACHE"
-    bool "WHISPER_COREML"
-    bool "WHISPER_COREML_ALLOW_FALLBACK"
-    ignored "WHISPER_CURL"
-    bool "WHISPER_FATAL_WARNINGS"
-    ignored "WHISPER_FFMPEG"
-    ignored "WHISPER_INCLUDE_INSTALL_DIR"
-    ignored "WHISPER_LIB_INSTALL_DIR"
-    bool "WHISPER_OPENVINO"
-    bool "WHISPER_SANITIZE_ADDRESS"
-    bool "WHISPER_SANITIZE_THREAD"
-    bool "WHISPER_SANITIZE_UNDEFINED"
-    ignored "WHISPER_SDL2"
-    pending "WHISPER_USE_SYSTEM_GGML"
+    cmake_options.each_pair do |name, (type, default_value)|
+      option = option_name(name)
+      value = type == "BOOL" ? enable_config(option) : arg_config("--#{option}")
+      @options[name] = [type, value]
+    end
+
+    configure_accelerate
+    configure_metal
+    configure_coreml
+  end
+
+  # See ggml/src/ggml-cpu/CMakeLists.txt
+  def configure_accelerate
+    if RUBY_PLATFORM.match?(/darwin/) && enabled?("GGML_ACCELERATE")
+      $LDFLAGS << " -framework Accelerate"
+    end
+  end
+
+  # See ggml/src/ggml-metal/CMakeLists.txt
+  def configure_metal
+    $LDFLAGS << " -framework Foundation -framework Metal -framework MetalKit" if enabled?("GGML_METAL")
+  end
+
+  # See src/CmakeLists.txt
+  def configure_coreml
+    if enabled?("WHISPER_COREML")
+      $LDFLAGS << " -framework Foundation -framework CoreML"
+      $CPPFLAGS << " -DRUBY_WHISPER_USE_COREML"
+    end
  end

  def option_name(name)
    name.downcase.gsub("_", "-")
  end

-  def bool(name)
-    option = option_name(name)
-    value = enable_config(option)
-    @options[name] = [:bool, value]
-  end
-
-  def string(name, type=:string)
-    option = "--#{option_name(name)}"
-    value = arg_config(option)
-    raise "String expected for #{option}" if value == true || value&.empty?
-    @options[name] = [type, value]
-  end
-
-  def path(name)
-    string(name, :path)
-  end
-
-  def filepath(name)
-    string(name, :filepath)
-  end
-
-  def pending(name)
-    @pending_options << name
-  end
-
-  def ignored(name)
-    @ignored_options << name
+  def enabled?(option)
+    if @options[option][1].nil?
+      cmake_options[option][1]
+    else
+      @options[option][1]
+    end
  end
 end
--- a/bindings/ruby/ext/ruby_whisper.c
+++ b/bindings/ruby/ext/ruby_whisper.c
@ -3,8 +3,10 @@
 #include "ruby_whisper.h"

 VALUE mWhisper;
+VALUE mVAD;
 VALUE cContext;
 VALUE cParams;
+VALUE cVADParams;
 VALUE eError;

 VALUE cSegment;
@ -20,6 +22,9 @@ ID id_new;
 ID id_to_path;
 ID id_URI;
 ID id_pre_converted_models;
+ID id_coreml_compiled_models;
+ID id_cache;
+ID id_n_processors;

 static bool is_log_callback_finalized = false;

@ -31,6 +36,7 @@ extern void init_ruby_whisper_params(VALUE *mWhisper);
 extern void init_ruby_whisper_error(VALUE *mWhisper);
 extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
 extern void init_ruby_whisper_model(VALUE *mWhisper);
+extern void init_ruby_whisper_vad_params(VALUE *mVAD);
 extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);

 /*
@ -80,6 +86,14 @@ static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
  return rb_str_new2(str_full);
 }

+/*
+ * call-seq:
+ *   system_info_str -> String
+ */
+static VALUE ruby_whisper_s_system_info_str(VALUE self) {
+  return rb_str_new2(whisper_print_system_info());
+}
+
 static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
  is_log_callback_finalized = true;
  return Qnil;
@ -116,16 +130,6 @@ static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_d
  return Qnil;
 }

-static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
-  rb_gc_mark(rwm->context);
-}
-
-static VALUE ruby_whisper_model_allocate(VALUE klass) {
-  ruby_whisper_model *rwm;
-  rwm = ALLOC(ruby_whisper_model);
-  return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
-}
-
 void Init_whisper() {
  id_to_s = rb_intern("to_s");
  id_call = rb_intern("call");
@ -137,8 +141,12 @@ void Init_whisper() {
  id_to_path = rb_intern("to_path");
  id_URI = rb_intern("URI");
  id_pre_converted_models = rb_intern("pre_converted_models");
+  id_coreml_compiled_models = rb_intern("coreml_compiled_models");
+  id_cache = rb_intern("cache");
+  id_n_processors = rb_intern("n_processors");

  mWhisper = rb_define_module("Whisper");
+  mVAD = rb_define_module_under(mWhisper, "VAD");

  rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
  rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
@ -151,6 +159,7 @@ void Init_whisper() {
  rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
  rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
  rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
+  rb_define_singleton_method(mWhisper, "system_info_str", ruby_whisper_s_system_info_str, 0);
  rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
  rb_define_private_method(rb_singleton_class(mWhisper), "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);

@ -159,6 +168,9 @@ void Init_whisper() {
  init_ruby_whisper_error(&mWhisper);
  init_ruby_whisper_segment(&mWhisper, &cContext);
  init_ruby_whisper_model(&mWhisper);
+  init_ruby_whisper_vad_params(&mVAD);

+  rb_require("whisper/context");
+  rb_require("whisper/segment");
  rb_require("whisper/model/uri");
 }
--- a/bindings/ruby/ext/ruby_whisper.h
+++ b/bindings/ruby/ext/ruby_whisper.h
@ -21,8 +21,13 @@ typedef struct {
  ruby_whisper_callback_container *progress_callback_container;
  ruby_whisper_callback_container *encoder_begin_callback_container;
  ruby_whisper_callback_container *abort_callback_container;
+  VALUE vad_params;
 } ruby_whisper_params;

+typedef struct {
+  struct whisper_vad_params params;
+} ruby_whisper_vad_params;
+
 typedef struct {
  VALUE context;
  int index;
--- a/bindings/ruby/ext/ruby_whisper_context.c
+++ b/bindings/ruby/ext/ruby_whisper_context.c
@ -11,15 +11,21 @@ extern ID id_new;
 extern ID id_to_path;
 extern ID id_URI;
 extern ID id_pre_converted_models;
+extern ID id_coreml_compiled_models;
+extern ID id_cache;
+extern ID id_n_processors;

 extern VALUE cContext;
 extern VALUE eError;
 extern VALUE cModel;

+extern const rb_data_type_t ruby_whisper_params_type;
 extern VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self);
-extern VALUE rb_whisper_model_initialize(VALUE context);
-extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
-extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
+extern VALUE rb_whisper_model_s_new(VALUE context);
+extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
+extern void prepare_transcription(ruby_whisper_params *rwp, VALUE *context);
+
+ID transcribe_option_names[1];

 static void
 ruby_whisper_free(ruby_whisper *rw)
@ -37,19 +43,74 @@ rb_whisper_mark(ruby_whisper *rw)
 }

 void
-rb_whisper_free(ruby_whisper *rw)
+rb_whisper_free(void *p)
 {
+  ruby_whisper *rw = (ruby_whisper *)p;
  ruby_whisper_free(rw);
  free(rw);
 }

+static size_t
+ruby_whisper_memsize(const void *p)
+{
+  const ruby_whisper *rw = (const ruby_whisper *)p;
+  size_t size = sizeof(rw);
+  if (!rw) {
+    return 0;
+  }
+  if (rw->context) {
+    size += sizeof(rw->context);
+  }
+  return size;
+}
+
+const rb_data_type_t ruby_whisper_type = {
+  "ruby_whisper",
+  {0, rb_whisper_free, ruby_whisper_memsize,},
+  0, 0,
+  0
+};
+
 static VALUE
 ruby_whisper_allocate(VALUE klass)
 {
  ruby_whisper *rw;
-  rw = ALLOC(ruby_whisper);
+  VALUE obj = TypedData_Make_Struct(klass, ruby_whisper, &ruby_whisper_type, rw);
  rw->context = NULL;
-  return Data_Wrap_Struct(klass, rb_whisper_mark, rb_whisper_free, rw);
+  return obj;
+}
+
+VALUE
+ruby_whisper_normalize_model_path(VALUE model_path)
+{
+  VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
+  VALUE pre_converted_model = rb_hash_aref(pre_converted_models, model_path);
+  if (!NIL_P(pre_converted_model)) {
+    model_path = pre_converted_model;
+#ifdef RUBY_WHISPER_USE_COREML
+    VALUE coreml_converted_models = rb_funcall(cModel, id_coreml_compiled_models, 0);
+    VALUE coreml_converted_model = rb_hash_aref(coreml_converted_models, pre_converted_model);
+    if (!NIL_P(coreml_converted_model)) {
+      rb_funcall(coreml_converted_model, id_cache, 0);
+    }
+#endif
+  }
+  else if (TYPE(model_path) == T_STRING) {
+    const char * model_path_str = StringValueCStr(model_path);
+    if (strncmp("http://", model_path_str, 7) == 0 || strncmp("https://", model_path_str, 8) == 0) {
+      VALUE uri_class = rb_const_get(cModel, id_URI);
+      model_path = rb_class_new_instance(1, &model_path, uri_class);
+    }
+  }
+  else if (rb_obj_is_kind_of(model_path, rb_path2class("URI::HTTP"))) {
+    VALUE uri_class = rb_const_get(cModel, id_URI);
+    model_path = rb_class_new_instance(1, &model_path, uri_class);
+  }
+  if (rb_respond_to(model_path, id_to_path)) {
+    model_path = rb_funcall(model_path, id_to_path, 0);
+  }
+
+  return model_path;
 }

 /*
@ -66,27 +127,9 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)

  // TODO: we can support init from buffer here too maybe another ruby object to expose
  rb_scan_args(argc, argv, "01", &whisper_model_file_path);
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);

-  VALUE pre_converted_models = rb_funcall(cModel, id_pre_converted_models, 0);
-  VALUE pre_converted_model = rb_hash_aref(pre_converted_models, whisper_model_file_path);
-  if (!NIL_P(pre_converted_model)) {
-    whisper_model_file_path = pre_converted_model;
-  }
-  if (TYPE(whisper_model_file_path) == T_STRING) {
-    const char * whisper_model_file_path_str = StringValueCStr(whisper_model_file_path);
-    if (strncmp("http://", whisper_model_file_path_str, 7) == 0 || strncmp("https://", whisper_model_file_path_str, 8) == 0) {
-      VALUE uri_class = rb_const_get(cModel, id_URI);
-      whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
-    }
-  }
-  if (rb_obj_is_kind_of(whisper_model_file_path, rb_path2class("URI::HTTP"))) {
-    VALUE uri_class = rb_const_get(cModel, id_URI);
-    whisper_model_file_path = rb_class_new_instance(1, &whisper_model_file_path, uri_class);
-  }
-  if (rb_respond_to(whisper_model_file_path, id_to_path)) {
-    whisper_model_file_path = rb_funcall(whisper_model_file_path, id_to_path, 0);
-  }
+  whisper_model_file_path = ruby_whisper_normalize_model_path(whisper_model_file_path);
  if (!rb_respond_to(whisper_model_file_path, id_to_s)) {
    rb_raise(rb_eRuntimeError, "Expected file path to model to initialize Whisper::Context");
  }
@ -104,7 +147,7 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
 VALUE ruby_whisper_model_n_vocab(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_vocab(rw->context));
 }

@ -115,7 +158,7 @@ VALUE ruby_whisper_model_n_vocab(VALUE self)
 VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_audio_ctx(rw->context));
 }

@ -126,7 +169,7 @@ VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
 VALUE ruby_whisper_model_n_audio_state(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_audio_state(rw->context));
 }

@ -137,7 +180,7 @@ VALUE ruby_whisper_model_n_audio_state(VALUE self)
 VALUE ruby_whisper_model_n_audio_head(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_audio_head(rw->context));
 }

@ -148,7 +191,7 @@ VALUE ruby_whisper_model_n_audio_head(VALUE self)
 VALUE ruby_whisper_model_n_audio_layer(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_audio_layer(rw->context));
 }

@ -159,7 +202,7 @@ VALUE ruby_whisper_model_n_audio_layer(VALUE self)
 VALUE ruby_whisper_model_n_text_ctx(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_text_ctx(rw->context));
 }

@ -170,7 +213,7 @@ VALUE ruby_whisper_model_n_text_ctx(VALUE self)
 VALUE ruby_whisper_model_n_text_state(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_text_state(rw->context));
 }

@ -181,7 +224,7 @@ VALUE ruby_whisper_model_n_text_state(VALUE self)
 VALUE ruby_whisper_model_n_text_head(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_text_head(rw->context));
 }

@ -192,7 +235,7 @@ VALUE ruby_whisper_model_n_text_head(VALUE self)
 VALUE ruby_whisper_model_n_text_layer(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_text_layer(rw->context));
 }

@ -203,7 +246,7 @@ VALUE ruby_whisper_model_n_text_layer(VALUE self)
 VALUE ruby_whisper_model_n_mels(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_mels(rw->context));
 }

@ -214,7 +257,7 @@ VALUE ruby_whisper_model_n_mels(VALUE self)
 VALUE ruby_whisper_model_ftype(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_ftype(rw->context));
 }

@ -225,7 +268,7 @@ VALUE ruby_whisper_model_ftype(VALUE self)
 VALUE ruby_whisper_model_type(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return rb_str_new2(whisper_model_type_readable(rw->context));
 }

@ -248,9 +291,9 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)

  ruby_whisper *rw;
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  VALUE params = argv[0];
-  Data_Get_Struct(params, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  VALUE samples = argv[1];
  int n_samples;
  rb_memory_view_t view;
@ -265,13 +308,20 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
    // Should check when samples.respond_to?(:length)?
  } else {
    if (TYPE(samples) == T_ARRAY) {
-      n_samples = RARRAY_LEN(samples);
+      if (RARRAY_LEN(samples) > INT_MAX) {
+        rb_raise(rb_eArgError, "samples are too long");
+      }
+      n_samples = (int)RARRAY_LEN(samples);
    } else if (memory_view_available_p) {
      if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
        view.obj = Qnil;
        rb_raise(rb_eArgError, "unable to get a memory view");
      }
-      n_samples = view.byte_size / view.item_size;
+      ssize_t n_samples_size = view.byte_size / view.item_size;
+      if (n_samples_size > INT_MAX) {
+        rb_raise(rb_eArgError, "samples are too long");
+      }
+      n_samples = (int)n_samples_size;
    } else if (rb_respond_to(samples, id_length)) {
      n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
    } else {
@ -296,7 +346,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
      }
    }
  }
-  register_callbacks(rwp, &self);
+  prepare_transcription(rwp, &self);
  const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
  if (0 == result) {
    return self;
@ -327,9 +377,9 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)

  ruby_whisper *rw;
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  VALUE params = argv[0];
-  Data_Get_Struct(params, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  VALUE samples = argv[1];
  int n_samples;
  int n_processors;
@ -359,10 +409,17 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
      view.obj = Qnil;
      rb_raise(rb_eArgError, "unable to get a memory view");
    }
-    n_samples = view.byte_size / view.item_size;
+    ssize_t n_samples_size = view.byte_size / view.item_size;
+    if (n_samples_size > INT_MAX) {
+      rb_raise(rb_eArgError, "samples are too long");
+    }
+    n_samples = (int)n_samples_size;
  } else {
    if (TYPE(samples) == T_ARRAY) {
-      n_samples = RARRAY_LEN(samples);
+      if (RARRAY_LEN(samples) > INT_MAX) {
+        rb_raise(rb_eArgError, "samples are too long");
+      }
+      n_samples = (int)RARRAY_LEN(samples);
    } else if (rb_respond_to(samples, id_length)) {
      n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
    } else {
@ -387,7 +444,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
      }
    }
  }
-  register_callbacks(rwp, &self);
+  prepare_transcription(rwp, &self);
  const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
  if (0 == result) {
    return self;
@ -406,7 +463,7 @@ static VALUE
 ruby_whisper_full_n_segments(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_full_n_segments(rw->context));
 }

@ -420,7 +477,7 @@ static VALUE
 ruby_whisper_full_lang_id(VALUE self)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_full_lang_id(rw->context));
 }

@ -445,10 +502,10 @@ static VALUE
 ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
-  return INT2NUM(t0);
+  return LONG2NUM(t0);
 }

 /*
@ -463,10 +520,10 @@ static VALUE
 ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
-  return INT2NUM(t1);
+  return LONG2NUM(t1);
 }

 /*
@ -481,7 +538,7 @@ static VALUE
 ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
  const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
  return speaker_turn_next ? Qtrue : Qfalse;
@ -499,7 +556,7 @@ static VALUE
 ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
  const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
  return rb_str_new2(text);
@ -513,7 +570,7 @@ static VALUE
 ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
 {
  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
  const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
  const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
  return DBL2NUM(no_speech_prob);
@ -524,7 +581,7 @@ ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
 static VALUE
 ruby_whisper_full_get_segment(VALUE self, VALUE i_segment)
 {
-  return rb_whisper_segment_initialize(self, NUM2INT(i_segment));
+  return rb_whisper_segment_s_new(self, NUM2INT(i_segment));
 }

 /*
@ -554,11 +611,11 @@ ruby_whisper_each_segment(VALUE self)
  }

  ruby_whisper *rw;
-  Data_Get_Struct(self, ruby_whisper, rw);
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);

  const int n_segments = whisper_full_n_segments(rw->context);
  for (int i = 0; i < n_segments; ++i) {
-    rb_yield(rb_whisper_segment_initialize(self, i));
+    rb_yield(rb_whisper_segment_s_new(self, i));
  }

  return self;
@ -571,7 +628,7 @@ ruby_whisper_each_segment(VALUE self)
 static VALUE
 ruby_whisper_get_model(VALUE self)
 {
-  return rb_whisper_model_initialize(self);
+  return rb_whisper_model_s_new(self);
 }

 void
@ -579,6 +636,8 @@ init_ruby_whisper_context(VALUE *mWhisper)
 {
  cContext = rb_define_class_under(*mWhisper, "Context", rb_cObject);

+  transcribe_option_names[0] = id_n_processors;
+
  rb_define_alloc_func(cContext, ruby_whisper_allocate);
  rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);

@ -605,7 +664,7 @@ init_ruby_whisper_context(VALUE *mWhisper)
  rb_define_method(cContext, "full", ruby_whisper_full, -1);
  rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);

-  // High leve
+  // High level
  rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
  rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);

--- a/bindings/ruby/ext/ruby_whisper_model.c
+++ b/bindings/ruby/ext/ruby_whisper_model.c
@ -1,22 +1,44 @@
 #include <ruby.h>
 #include "ruby_whisper.h"

+extern const rb_data_type_t ruby_whisper_type;
+
 extern VALUE cModel;

-static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
-  rb_gc_mark(rwm->context);
+static void rb_whisper_model_mark(void *p) {
+  ruby_whisper_model *rwm = (ruby_whisper_model *)p;
+  if (rwm->context) {
+    rb_gc_mark(rwm->context);
+  }
 }

+static size_t
+ruby_whisper_model_memsize(const void *p)
+{
+  const ruby_whisper_model *rwm = (const ruby_whisper_model *)p;
+  size_t size = sizeof(rwm);
+  if (!rwm) {
+    return 0;
+  }
+  return size;
+}
+
+static const rb_data_type_t rb_whisper_model_type = {
+  "ruby_whisper_model",
+  {rb_whisper_model_mark, RUBY_DEFAULT_FREE, ruby_whisper_model_memsize,},
+  0, 0,
+  0
+};
+
 static VALUE ruby_whisper_model_allocate(VALUE klass) {
  ruby_whisper_model *rwm;
-  rwm = ALLOC(ruby_whisper_model);
-  return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
+  return TypedData_Make_Struct(klass, ruby_whisper_model, &rb_whisper_model_type, rwm);
 }

-VALUE rb_whisper_model_initialize(VALUE context) {
+VALUE rb_whisper_model_s_new(VALUE context) {
  ruby_whisper_model *rwm;
  const VALUE model = ruby_whisper_model_allocate(cModel);
-  Data_Get_Struct(model, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(model, ruby_whisper_model, &rb_whisper_model_type, rwm);
  rwm->context = context;
  return model;
 };
@ -29,9 +51,9 @@ static VALUE
 ruby_whisper_model_n_vocab(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_vocab(rw->context));
 }

@ -43,9 +65,9 @@ static VALUE
 ruby_whisper_model_n_audio_ctx(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_audio_ctx(rw->context));
 }

@ -57,9 +79,9 @@ static VALUE
 ruby_whisper_model_n_audio_state(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_audio_state(rw->context));
 }

@ -71,9 +93,9 @@ static VALUE
 ruby_whisper_model_n_audio_head(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_audio_head(rw->context));
 }

@ -85,9 +107,9 @@ static VALUE
 ruby_whisper_model_n_audio_layer(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_audio_layer(rw->context));
 }

@ -99,9 +121,9 @@ static VALUE
 ruby_whisper_model_n_text_ctx(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_text_ctx(rw->context));
 }

@ -113,9 +135,9 @@ static VALUE
 ruby_whisper_model_n_text_state(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_text_state(rw->context));
 }

@ -127,9 +149,9 @@ static VALUE
 ruby_whisper_model_n_text_head(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_text_head(rw->context));
 }

@ -141,9 +163,9 @@ static VALUE
 ruby_whisper_model_n_text_layer(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_text_layer(rw->context));
 }

@ -155,9 +177,9 @@ static VALUE
 ruby_whisper_model_n_mels(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_n_mels(rw->context));
 }

@ -169,9 +191,9 @@ static VALUE
 ruby_whisper_model_ftype(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return INT2NUM(whisper_model_ftype(rw->context));
 }

@ -183,9 +205,9 @@ static VALUE
 ruby_whisper_model_type(VALUE self)
 {
  ruby_whisper_model *rwm;
-  Data_Get_Struct(self, ruby_whisper_model, rwm);
+  TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
  ruby_whisper *rw;
-  Data_Get_Struct(rwm->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
  return rb_str_new2(whisper_model_type_readable(rw->context));
 }

--- a/bindings/ruby/ext/ruby_whisper_params.c
+++ b/bindings/ruby/ext/ruby_whisper_params.c
@ -3,7 +3,7 @@

 #define BOOL_PARAMS_SETTER(self, prop, value) \
  ruby_whisper_params *rwp; \
-  Data_Get_Struct(self, ruby_whisper_params, rwp); \
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
  if (value == Qfalse || value == Qnil) { \
    rwp->params.prop = false; \
  } else { \
@ -13,7 +13,7 @@

 #define BOOL_PARAMS_GETTER(self,  prop) \
  ruby_whisper_params *rwp; \
-  Data_Get_Struct(self, ruby_whisper_params, rwp); \
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp); \
  if (rwp->params.prop) { \
    return Qtrue; \
  } else { \
@ -26,13 +26,16 @@
  rb_define_method(cParams, #param_name, ruby_whisper_params_get_ ## param_name, 0); \
  rb_define_method(cParams, #param_name "=", ruby_whisper_params_set_ ## param_name, 1);

-#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 32
+#define RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT 35

 extern VALUE cParams;
+extern VALUE cVADParams;

 extern ID id_call;

-extern VALUE rb_whisper_segment_initialize(VALUE context, int index);
+extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
+extern VALUE rb_whisper_segment_s_new(VALUE context, int index);
+extern const rb_data_type_t ruby_whisper_vad_params_type;

 static ID param_names[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT];
 static ID id_language;
@ -67,10 +70,15 @@ static ID id_encoder_begin_callback;
 static ID id_encoder_begin_callback_user_data;
 static ID id_abort_callback;
 static ID id_abort_callback_user_data;
+static ID id_vad;
+static ID id_vad_model_path;
+static ID id_vad_params;

 static void
 rb_whisper_callbcack_container_mark(ruby_whisper_callback_container *rwc)
 {
+  if (rwc == NULL) return;
+
  rb_gc_mark(rwc->user_data);
  rb_gc_mark(rwc->callback);
  rb_gc_mark(rwc->callbacks);
@ -102,7 +110,7 @@ static void new_segment_callback(struct whisper_context *ctx, struct whisper_sta
  const int n_segments = whisper_full_n_segments_from_state(state);
  for (int i = n_new; i > 0; i--) {
    int i_segment = n_segments - i;
-    VALUE segment = rb_whisper_segment_initialize(*container->context, i_segment);
+    VALUE segment = rb_whisper_segment_s_new(*container->context, i_segment);
    for (int j = 0; j < callbacks_len; j++) {
      VALUE cb = rb_ary_entry(container->callbacks, j);
      rb_funcall(cb, id_call, 1, segment);
@ -177,7 +185,7 @@ static bool abort_callback(void * user_data) {
  return false;
 }

-void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
+static void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
  if (!NIL_P(rwp->new_segment_callback_container->callback) || 0 != RARRAY_LEN(rwp->new_segment_callback_container->callbacks)) {
    rwp->new_segment_callback_container->context = context;
    rwp->params.new_segment_callback = new_segment_callback;
@ -203,13 +211,29 @@ void register_callbacks(ruby_whisper_params * rwp, VALUE * context) {
  }
 }

-void
-rb_whisper_params_mark(ruby_whisper_params *rwp)
+static void set_vad_params(ruby_whisper_params *rwp)
 {
+  ruby_whisper_vad_params * rwvp;
+  TypedData_Get_Struct(rwp->vad_params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  rwp->params.vad_params = rwvp->params;
+}
+
+void
+prepare_transcription(ruby_whisper_params *rwp, VALUE *context)
+{
+  register_callbacks(rwp, context);
+  set_vad_params(rwp);
+}
+
+void
+rb_whisper_params_mark(void *p)
+{
+  ruby_whisper_params *rwp = (ruby_whisper_params *)p;
  rb_whisper_callbcack_container_mark(rwp->new_segment_callback_container);
  rb_whisper_callbcack_container_mark(rwp->progress_callback_container);
  rb_whisper_callbcack_container_mark(rwp->encoder_begin_callback_container);
  rb_whisper_callbcack_container_mark(rwp->abort_callback_container);
+  rb_gc_mark(rwp->vad_params);
 }

 void
@ -218,25 +242,46 @@ ruby_whisper_params_free(ruby_whisper_params *rwp)
 }

 void
-rb_whisper_params_free(ruby_whisper_params *rwp)
+rb_whisper_params_free(void *p)
 {
+  ruby_whisper_params *rwp = (ruby_whisper_params *)p;
  // How to free user_data and callback only when not referred to by others?
  ruby_whisper_params_free(rwp);
  free(rwp);
 }

+static size_t
+ruby_whisper_params_memsize(const void *p)
+{
+  const ruby_whisper_params *rwp = (const ruby_whisper_params *)p;
+
+  return sizeof(ruby_whisper_params) + sizeof(rwp->params) + sizeof(rwp->vad_params);
+}
+
+const rb_data_type_t ruby_whisper_params_type = {
+  "ruby_whisper_params",
+  {
+    rb_whisper_params_mark,
+    rb_whisper_params_free,
+    ruby_whisper_params_memsize,
+  },
+  0, 0,
+  0
+};
+
 static VALUE
 ruby_whisper_params_allocate(VALUE klass)
 {
  ruby_whisper_params *rwp;
-  rwp = ALLOC(ruby_whisper_params);
+  VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
  rwp->diarize = false;
+  rwp->vad_params = TypedData_Wrap_Struct(cVADParams, &ruby_whisper_vad_params_type, (void *)&rwp->params.vad_params);
  rwp->new_segment_callback_container = rb_whisper_callback_container_allocate();
  rwp->progress_callback_container = rb_whisper_callback_container_allocate();
  rwp->encoder_begin_callback_container = rb_whisper_callback_container_allocate();
  rwp->abort_callback_container = rb_whisper_callback_container_allocate();
-  return Data_Wrap_Struct(klass, rb_whisper_params_mark, rb_whisper_params_free, rwp);
+  return obj;
 }

 /*
@ -249,7 +294,7 @@ static VALUE
 ruby_whisper_params_set_language(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  if (value == Qfalse || value == Qnil) {
    rwp->params.language = "auto";
  } else {
@ -265,7 +310,7 @@ static VALUE
 ruby_whisper_params_get_language(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  if (rwp->params.language) {
    return rb_str_new2(rwp->params.language);
  } else {
@ -502,7 +547,7 @@ static VALUE
 ruby_whisper_params_get_initial_prompt(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->params.initial_prompt == NULL ? Qnil : rb_str_new2(rwp->params.initial_prompt);
 }
 /*
@ -513,7 +558,7 @@ static VALUE
 ruby_whisper_params_set_initial_prompt(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.initial_prompt = StringValueCStr(value);
  return value;
 }
@ -527,7 +572,7 @@ static VALUE
 ruby_whisper_params_get_diarize(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  if (rwp->diarize) {
    return Qtrue;
  } else {
@ -542,7 +587,7 @@ static VALUE
 ruby_whisper_params_set_diarize(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  if (value == Qfalse || value == Qnil) {
    rwp->diarize = false;
  } else {
@ -561,7 +606,7 @@ static VALUE
 ruby_whisper_params_get_offset(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return INT2NUM(rwp->params.offset_ms);
 }
 /*
@ -572,7 +617,7 @@ static VALUE
 ruby_whisper_params_set_offset(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.offset_ms = NUM2INT(value);
  return value;
 }
@ -586,7 +631,7 @@ static VALUE
 ruby_whisper_params_get_duration(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return INT2NUM(rwp->params.duration_ms);
 }
 /*
@ -597,7 +642,7 @@ static VALUE
 ruby_whisper_params_set_duration(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.duration_ms = NUM2INT(value);
  return value;
 }
@ -612,7 +657,7 @@ static VALUE
 ruby_whisper_params_get_max_text_tokens(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return INT2NUM(rwp->params.n_max_text_ctx);
 }
 /*
@ -623,7 +668,7 @@ static VALUE
 ruby_whisper_params_set_max_text_tokens(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.n_max_text_ctx = NUM2INT(value);
  return value;
 }
@ -635,7 +680,7 @@ static VALUE
 ruby_whisper_params_get_temperature(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return DBL2NUM(rwp->params.temperature);
 }
 /*
@ -646,7 +691,7 @@ static VALUE
 ruby_whisper_params_set_temperature(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.temperature = RFLOAT_VALUE(value);
  return value;
 }
@ -660,7 +705,7 @@ static VALUE
 ruby_whisper_params_get_max_initial_ts(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return DBL2NUM(rwp->params.max_initial_ts);
 }
 /*
@ -671,7 +716,7 @@ static VALUE
 ruby_whisper_params_set_max_initial_ts(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.max_initial_ts = RFLOAT_VALUE(value);
  return value;
 }
@ -683,7 +728,7 @@ static VALUE
 ruby_whisper_params_get_length_penalty(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return DBL2NUM(rwp->params.length_penalty);
 }
 /*
@ -694,7 +739,7 @@ static VALUE
 ruby_whisper_params_set_length_penalty(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.length_penalty = RFLOAT_VALUE(value);
  return value;
 }
@ -706,7 +751,7 @@ static VALUE
 ruby_whisper_params_get_temperature_inc(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return DBL2NUM(rwp->params.temperature_inc);
 }
 /*
@ -717,7 +762,7 @@ static VALUE
 ruby_whisper_params_set_temperature_inc(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.temperature_inc = RFLOAT_VALUE(value);
  return value;
 }
@ -731,7 +776,7 @@ static VALUE
 ruby_whisper_params_get_entropy_thold(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return DBL2NUM(rwp->params.entropy_thold);
 }
 /*
@ -742,7 +787,7 @@ static VALUE
 ruby_whisper_params_set_entropy_thold(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.entropy_thold = RFLOAT_VALUE(value);
  return value;
 }
@ -754,7 +799,7 @@ static VALUE
 ruby_whisper_params_get_logprob_thold(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return DBL2NUM(rwp->params.logprob_thold);
 }
 /*
@ -765,7 +810,7 @@ static VALUE
 ruby_whisper_params_set_logprob_thold(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.logprob_thold = RFLOAT_VALUE(value);
  return value;
 }
@ -777,7 +822,7 @@ static VALUE
 ruby_whisper_params_get_no_speech_thold(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return DBL2NUM(rwp->params.no_speech_thold);
 }
 /*
@ -788,7 +833,7 @@ static VALUE
 ruby_whisper_params_set_no_speech_thold(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->params.no_speech_thold = RFLOAT_VALUE(value);
  return value;
 }
@ -796,7 +841,7 @@ static VALUE
 ruby_whisper_params_get_new_segment_callback(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->new_segment_callback_container->callback;
 }
 /*
@ -813,7 +858,7 @@ static VALUE
 ruby_whisper_params_set_new_segment_callback(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->new_segment_callback_container->callback = value;
  return value;
 }
@ -821,7 +866,7 @@ static VALUE
 ruby_whisper_params_get_new_segment_callback_user_data(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->new_segment_callback_container->user_data;
 }
 /*
@ -834,7 +879,7 @@ static VALUE
 ruby_whisper_params_set_new_segment_callback_user_data(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->new_segment_callback_container->user_data = value;
  return value;
 }
@ -842,7 +887,7 @@ static VALUE
 ruby_whisper_params_get_progress_callback(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->progress_callback_container->callback;
 }
 /*
@ -861,7 +906,7 @@ static VALUE
 ruby_whisper_params_set_progress_callback(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->progress_callback_container->callback = value;
  return value;
 }
@ -869,7 +914,7 @@ static VALUE
 ruby_whisper_params_get_progress_callback_user_data(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->progress_callback_container->user_data;
 }
 /*
@ -882,7 +927,7 @@ static VALUE
 ruby_whisper_params_set_progress_callback_user_data(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->progress_callback_container->user_data = value;
  return value;
 }
@ -891,7 +936,7 @@ static VALUE
 ruby_whisper_params_get_encoder_begin_callback(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->encoder_begin_callback_container->callback;
 }

@ -909,7 +954,7 @@ static VALUE
 ruby_whisper_params_set_encoder_begin_callback(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->encoder_begin_callback_container->callback = value;
  return value;
 }
@ -918,7 +963,7 @@ static VALUE
 ruby_whisper_params_get_encoder_begin_callback_user_data(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->encoder_begin_callback_container->user_data;
 }

@ -932,7 +977,7 @@ static VALUE
 ruby_whisper_params_set_encoder_begin_callback_user_data(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->encoder_begin_callback_container->user_data = value;
  return value;
 }
@ -941,7 +986,7 @@ static VALUE
 ruby_whisper_params_get_abort_callback(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->abort_callback_container->callback;
 }
 /*
@ -958,7 +1003,7 @@ static VALUE
 ruby_whisper_params_set_abort_callback(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->abort_callback_container->callback = value;
  return value;
 }
@ -966,7 +1011,7 @@ static VALUE
 ruby_whisper_params_get_abort_callback_user_data(VALUE self)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  return rwp->abort_callback_container->user_data;
 }
 /*
@ -979,11 +1024,74 @@ static VALUE
 ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value)
 {
  ruby_whisper_params *rwp;
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  rwp->abort_callback_container->user_data = value;
  return value;
 }

+/*
+ * call-seq:
+ *   vad = use_vad -> use_vad
+ */
+static VALUE
+ruby_whisper_params_get_vad(VALUE self)
+{
+  BOOL_PARAMS_GETTER(self, vad)
+}
+
+static VALUE
+ruby_whisper_params_set_vad(VALUE self, VALUE value)
+{
+  BOOL_PARAMS_SETTER(self, vad, value)
+}
+
+/*
+ * call-seq:
+ *   vad_model_path = model_path -> model_path
+ */
+static VALUE
+ruby_whisper_params_set_vad_model_path(VALUE self, VALUE value)
+{
+  ruby_whisper_params *rwp;
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  if (NIL_P(value)) {
+    rwp->params.vad_model_path = NULL;
+    return value;
+  }
+  VALUE path = ruby_whisper_normalize_model_path(value);
+  rwp->params.vad_model_path = StringValueCStr(path);
+  return value;
+}
+
+static VALUE
+ruby_whisper_params_get_vad_model_path(VALUE self)
+{
+  ruby_whisper_params *rwp;
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  return rwp->params.vad_model_path == NULL ? Qnil : rb_str_new2(rwp->params.vad_model_path);
+}
+
+/*
+ * call-seq:
+ *   vad_params = params -> params
+ */
+static VALUE
+ruby_whisper_params_set_vad_params(VALUE self, VALUE value)
+{
+  ruby_whisper_params *rwp;
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  rwp->vad_params = value;
+  return value;
+}
+
+static VALUE
+ruby_whisper_params_get_vad_params(VALUE self)
+{
+  ruby_whisper_params *rwp;
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
+  return rwp->vad_params;
+}
+
 #define SET_PARAM_IF_SAME(param_name) \
  if (id == id_ ## param_name) { \
    ruby_whisper_params_set_ ## param_name(self, value); \
@ -993,7 +1101,6 @@ ruby_whisper_params_set_abort_callback_user_data(VALUE self, VALUE value)
 static VALUE
 ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
 {
-
  VALUE kw_hash;
  VALUE values[RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT] = {Qundef};
  VALUE value;
@ -1007,7 +1114,7 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
  }

  rb_get_kwargs(kw_hash, param_names, 0, RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT, values);
-  Data_Get_Struct(self, ruby_whisper_params, rwp);
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);

  for (i = 0; i < RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT; i++) {
    id = param_names[i];
@ -1050,6 +1157,9 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
      SET_PARAM_IF_SAME(encoder_begin_callback_user_data)
      SET_PARAM_IF_SAME(abort_callback)
      SET_PARAM_IF_SAME(abort_callback_user_data)
+      SET_PARAM_IF_SAME(vad)
+      SET_PARAM_IF_SAME(vad_model_path)
+      SET_PARAM_IF_SAME(vad_params)
    }
  }

@ -1071,10 +1181,10 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
 static VALUE
 ruby_whisper_params_on_new_segment(VALUE self)
 {
-  ruby_whisper_params *rws;
-  Data_Get_Struct(self, ruby_whisper_params, rws);
+  ruby_whisper_params *rwp;
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  const VALUE blk = rb_block_proc();
-  rb_ary_push(rws->new_segment_callback_container->callbacks, blk);
+  rb_ary_push(rwp->new_segment_callback_container->callbacks, blk);
  return Qnil;
 }

@ -1091,10 +1201,10 @@ ruby_whisper_params_on_new_segment(VALUE self)
 static VALUE
 ruby_whisper_params_on_progress(VALUE self)
 {
-  ruby_whisper_params *rws;
-  Data_Get_Struct(self, ruby_whisper_params, rws);
+  ruby_whisper_params *rwp;
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  const VALUE blk = rb_block_proc();
-  rb_ary_push(rws->progress_callback_container->callbacks, blk);
+  rb_ary_push(rwp->progress_callback_container->callbacks, blk);
  return Qnil;
 }

@ -1111,10 +1221,10 @@ ruby_whisper_params_on_progress(VALUE self)
 static VALUE
 ruby_whisper_params_on_encoder_begin(VALUE self)
 {
-  ruby_whisper_params *rws;
-  Data_Get_Struct(self, ruby_whisper_params, rws);
+  ruby_whisper_params *rwp;
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  const VALUE blk = rb_block_proc();
-  rb_ary_push(rws->encoder_begin_callback_container->callbacks, blk);
+  rb_ary_push(rwp->encoder_begin_callback_container->callbacks, blk);
  return Qnil;
 }

@ -1135,10 +1245,10 @@ ruby_whisper_params_on_encoder_begin(VALUE self)
 static VALUE
 ruby_whisper_params_abort_on(VALUE self)
 {
-  ruby_whisper_params *rws;
-  Data_Get_Struct(self, ruby_whisper_params, rws);
+  ruby_whisper_params *rwp;
+  TypedData_Get_Struct(self, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  const VALUE blk = rb_block_proc();
-  rb_ary_push(rws->abort_callback_container->callbacks, blk);
+  rb_ary_push(rwp->abort_callback_container->callbacks, blk);
  return Qnil;
 }

@ -1182,6 +1292,9 @@ init_ruby_whisper_params(VALUE *mWhisper)
  DEFINE_PARAM(encoder_begin_callback_user_data, 29)
  DEFINE_PARAM(abort_callback, 30)
  DEFINE_PARAM(abort_callback_user_data, 31)
+  DEFINE_PARAM(vad, 32)
+  DEFINE_PARAM(vad_model_path, 33)
+  DEFINE_PARAM(vad_params, 34)

  rb_define_method(cParams, "on_new_segment", ruby_whisper_params_on_new_segment, 0);
  rb_define_method(cParams, "on_progress", ruby_whisper_params_on_progress, 0);
--- a/bindings/ruby/ext/ruby_whisper_segment.c
+++ b/bindings/ruby/ext/ruby_whisper_segment.c
@ -1,28 +1,57 @@
 #include <ruby.h>
 #include "ruby_whisper.h"

+#define N_KEY_NAMES 5
+
+static VALUE sym_start_time;
+static VALUE sym_end_time;
+static VALUE sym_text;
+static VALUE sym_no_speech_prob;
+static VALUE sym_speaker_turn_next;
+static VALUE key_names;
+
+extern const rb_data_type_t ruby_whisper_type;
+
 extern VALUE cSegment;

 static void
-rb_whisper_segment_mark(ruby_whisper_segment *rws)
+rb_whisper_segment_mark(void *p)
 {
+  ruby_whisper_segment *rws = (ruby_whisper_segment *)p;
  rb_gc_mark(rws->context);
 }

+static size_t
+ruby_whisper_segment_memsize(const void *p)
+{
+  const ruby_whisper_segment *rws = (const ruby_whisper_segment *)p;
+  size_t size = sizeof(rws);
+  if (!rws) {
+    return 0;
+  }
+  return size;
+}
+
+static const rb_data_type_t ruby_whisper_segment_type = {
+  "ruby_whisper_segment",
+  {rb_whisper_segment_mark, RUBY_DEFAULT_FREE, ruby_whisper_segment_memsize,},
+  0, 0,
+  0
+};
+
 VALUE
 ruby_whisper_segment_allocate(VALUE klass)
 {
  ruby_whisper_segment *rws;
-  rws = ALLOC(ruby_whisper_segment);
-  return Data_Wrap_Struct(klass, rb_whisper_segment_mark, RUBY_DEFAULT_FREE, rws);
+  return TypedData_Make_Struct(klass, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
 }

 VALUE
-rb_whisper_segment_initialize(VALUE context, int index)
+rb_whisper_segment_s_new(VALUE context, int index)
 {
  ruby_whisper_segment *rws;
  const VALUE segment = ruby_whisper_segment_allocate(cSegment);
-  Data_Get_Struct(segment, ruby_whisper_segment, rws);
+  TypedData_Get_Struct(segment, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
  rws->context = context;
  rws->index = index;
  return segment;
@ -38,12 +67,12 @@ static VALUE
 ruby_whisper_segment_get_start_time(VALUE self)
 {
  ruby_whisper_segment *rws;
-  Data_Get_Struct(self, ruby_whisper_segment, rws);
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
  ruby_whisper *rw;
-  Data_Get_Struct(rws->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
  const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
-  return INT2NUM(t0 * 10);
+  return LONG2NUM(t0 * 10);
 }

 /*
@ -56,12 +85,12 @@ static VALUE
 ruby_whisper_segment_get_end_time(VALUE self)
 {
  ruby_whisper_segment *rws;
-  Data_Get_Struct(self, ruby_whisper_segment, rws);
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
  ruby_whisper *rw;
-  Data_Get_Struct(rws->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
  const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
  // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
-  return INT2NUM(t1 * 10);
+  return LONG2NUM(t1 * 10);
 }

 /*
@ -74,9 +103,9 @@ static VALUE
 ruby_whisper_segment_get_speaker_turn_next(VALUE self)
 {
  ruby_whisper_segment *rws;
-  Data_Get_Struct(self, ruby_whisper_segment, rws);
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
  ruby_whisper *rw;
-  Data_Get_Struct(rws->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
  return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
 }

@ -88,9 +117,9 @@ static VALUE
 ruby_whisper_segment_get_text(VALUE self)
 {
  ruby_whisper_segment *rws;
-  Data_Get_Struct(self, ruby_whisper_segment, rws);
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
  ruby_whisper *rw;
-  Data_Get_Struct(rws->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
  const char * text = whisper_full_get_segment_text(rw->context, rws->index);
  return rb_str_new2(text);
 }
@ -103,21 +132,89 @@ static VALUE
 ruby_whisper_segment_get_no_speech_prob(VALUE self)
 {
  ruby_whisper_segment *rws;
-  Data_Get_Struct(self, ruby_whisper_segment, rws);
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
  ruby_whisper *rw;
-  Data_Get_Struct(rws->context, ruby_whisper, rw);
+  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
  return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
 }

+/*
+ * call-seq:
+ *   deconstruct_keys(keys) -> hash
+ *
+ *  Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+ *
+ *   whisper.each_segment do |segment|
+ *     segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+ *
+ *     puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
+ *   end
+ */
+static VALUE
+ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
+{
+  ruby_whisper_segment *rws;
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+  ruby_whisper *rw;
+  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+
+  VALUE hash = rb_hash_new();
+  long n_keys;
+  if (NIL_P(keys)) {
+    keys = key_names;
+    n_keys = N_KEY_NAMES;
+  } else {
+    n_keys = RARRAY_LEN(keys);
+    if (n_keys > N_KEY_NAMES) {
+      return hash;
+    }
+  }
+  for (int i = 0; i < n_keys; i++) {
+    VALUE key = rb_ary_entry(keys, i);
+    if (key == sym_start_time) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
+    }
+    if (key == sym_end_time) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
+    }
+    if (key == sym_text) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
+    }
+    if (key == sym_no_speech_prob) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
+    }
+    if (key == sym_speaker_turn_next) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
+    }
+  }
+
+  return hash;
+}
+
 void
 init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
 {
  cSegment  = rb_define_class_under(*mWhisper, "Segment", rb_cObject);

+  sym_start_time = ID2SYM(rb_intern("start_time"));
+  sym_end_time = ID2SYM(rb_intern("end_time"));
+  sym_text = ID2SYM(rb_intern("text"));
+  sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
+  sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
+  key_names = rb_ary_new3(
+    N_KEY_NAMES,
+    sym_start_time,
+    sym_end_time,
+    sym_text,
+    sym_no_speech_prob,
+    sym_speaker_turn_next
+  );
+
  rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
  rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
  rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
-  rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
+  rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
  rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
  rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
+  rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
 }
--- a/bindings/ruby/ext/ruby_whisper_transcribe.cpp
+++ b/bindings/ruby/ext/ruby_whisper_transcribe.cpp
@ -8,11 +8,15 @@
 extern "C" {
 #endif

+extern const rb_data_type_t ruby_whisper_type;
+extern const rb_data_type_t ruby_whisper_params_type;
+
 extern ID id_to_s;
 extern ID id_call;
+extern ID transcribe_option_names[1];

 extern void
-register_callbacks(ruby_whisper_params * rwp, VALUE * self);
+prepare_transcription(ruby_whisper_params * rwp, VALUE * self);

 /*
 * transcribe a single file
@ -31,11 +35,16 @@ VALUE
 ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
  ruby_whisper *rw;
  ruby_whisper_params *rwp;
-  VALUE wave_file_path, blk, params;
+  VALUE wave_file_path, blk, params, kws;
+  VALUE opts[1];

-  rb_scan_args(argc, argv, "02&", &wave_file_path, &params, &blk);
-  Data_Get_Struct(self, ruby_whisper, rw);
-  Data_Get_Struct(params, ruby_whisper_params, rwp);
+  rb_scan_args_kw(RB_SCAN_ARGS_LAST_HASH_KEYWORDS, argc, argv, "2:&", &wave_file_path, &params, &kws, &blk);
+  rb_get_kwargs(kws, transcribe_option_names, 0, 1, opts);
+
+  int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
+
+  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);

  if (!rb_respond_to(wave_file_path, id_to_s)) {
    rb_raise(rb_eRuntimeError, "Expected file path to wave file");
@ -61,22 +70,22 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
  //   rwp->params.encoder_begin_callback_user_data = &is_aborted;
  // }

-  register_callbacks(rwp, &self);
+  prepare_transcription(rwp, &self);

-  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), 1) != 0) {
+  if (whisper_full_parallel(rw->context, rwp->params, pcmf32.data(), pcmf32.size(), n_processors) != 0) {
    fprintf(stderr, "failed to process audio\n");
    return self;
  }
+  if (NIL_P(blk)) {
+    return self;
+  }
  const int n_segments = whisper_full_n_segments(rw->context);
  VALUE output = rb_str_new2("");
  for (int i = 0; i < n_segments; ++i) {
    const char * text = whisper_full_get_segment_text(rw->context, i);
    output = rb_str_concat(output, rb_str_new2(text));
  }
-  VALUE idCall = id_call;
-  if (blk != Qnil) {
-    rb_funcall(blk, idCall, 1, output);
-  }
+  rb_funcall(blk, id_call, 1, output);
  return self;
 }
 #ifdef __cplusplus
--- a/bindings/ruby/ext/ruby_whisper_vad_params.c
+++ b/bindings/ruby/ext/ruby_whisper_vad_params.c
@ -0,0 +1,288 @@
+#include <ruby.h>
+#include "ruby_whisper.h"
+
+#define DEFINE_PARAM(param_name, nth) \
+  id_ ## param_name = rb_intern(#param_name); \
+  param_names[nth] = id_ ## param_name; \
+  rb_define_method(cVADParams, #param_name, ruby_whisper_vad_params_get_ ## param_name, 0); \
+  rb_define_method(cVADParams, #param_name "=", ruby_whisper_vad_params_set_ ## param_name, 1);
+
+#define NUM_PARAMS 6
+
+extern VALUE cVADParams;
+
+static size_t
+ruby_whisper_vad_params_memsize(const void *p)
+{
+  const struct ruby_whisper_vad_params *params = p;
+  size_t size = sizeof(params);
+  if (!params) {
+    return 0;
+  }
+  return size;
+}
+
+static ID param_names[NUM_PARAMS];
+static ID id_threshold;
+static ID id_min_speech_duration_ms;
+static ID id_min_silence_duration_ms;
+static ID id_max_speech_duration_s;
+static ID id_speech_pad_ms;
+static ID id_samples_overlap;
+
+const rb_data_type_t ruby_whisper_vad_params_type = {
+  "ruby_whisper_vad_params",
+  {0, 0, ruby_whisper_vad_params_memsize,},
+  0, 0,
+  0
+};
+
+static VALUE
+ruby_whisper_vad_params_s_allocate(VALUE klass)
+{
+  ruby_whisper_vad_params *rwvp;
+  VALUE obj = TypedData_Make_Struct(klass, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  rwvp->params = whisper_vad_default_params();
+  return obj;
+}
+
+/*
+ * Probability threshold to consider as speech.
+ *
+ * call-seq:
+ *   threshold = th -> th
+ */
+static VALUE
+ruby_whisper_vad_params_set_threshold(VALUE self, VALUE value)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  rwvp->params.threshold = RFLOAT_VALUE(value);
+  return value;
+}
+
+static VALUE
+ruby_whisper_vad_params_get_threshold(VALUE self)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  return DBL2NUM(rwvp->params.threshold);
+}
+
+/*
+ * Min duration for a valid speech segment.
+ *
+ * call-seq:
+ *   min_speech_duration_ms = duration_ms -> duration_ms
+ */
+static VALUE
+ruby_whisper_vad_params_set_min_speech_duration_ms(VALUE self, VALUE value)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  rwvp->params.min_speech_duration_ms = NUM2INT(value);
+  return value;
+}
+
+static VALUE
+ruby_whisper_vad_params_get_min_speech_duration_ms(VALUE self)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  return INT2NUM(rwvp->params.min_speech_duration_ms);
+}
+
+/*
+ * Min silence duration to consider speech as ended.
+ *
+ * call-seq:
+ *   min_silence_duration_ms = duration_ms -> duration_ms
+ */
+static VALUE
+ruby_whisper_vad_params_set_min_silence_duration_ms(VALUE self, VALUE value)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  rwvp->params.min_silence_duration_ms = NUM2INT(value);
+  return value;
+}
+
+static VALUE
+ruby_whisper_vad_params_get_min_silence_duration_ms(VALUE self)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  return INT2NUM(rwvp->params.min_silence_duration_ms);
+}
+
+/*
+ * Max duration of a speech segment before forcing a new segment.
+ *
+ * call-seq:
+ *   max_speech_duration_s = duration_s -> duration_s
+ */
+static VALUE
+ruby_whisper_vad_params_set_max_speech_duration_s(VALUE self, VALUE value)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  rwvp->params.max_speech_duration_s = RFLOAT_VALUE(value);
+  return value;
+}
+
+static VALUE
+ruby_whisper_vad_params_get_max_speech_duration_s(VALUE self)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  return DBL2NUM(rwvp->params.max_speech_duration_s);
+}
+
+/*
+ * Padding added before and after speech segments.
+ *
+ * call-seq:
+ *   speech_pad_ms = pad_ms -> pad_ms
+ */
+static VALUE
+ruby_whisper_vad_params_set_speech_pad_ms(VALUE self, VALUE value)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  rwvp->params.speech_pad_ms = NUM2INT(value);
+  return value;
+}
+
+static VALUE
+ruby_whisper_vad_params_get_speech_pad_ms(VALUE self)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  return INT2NUM(rwvp->params.speech_pad_ms);
+}
+
+/*
+ * Overlap in seconds when copying audio samples from speech segment.
+ *
+ * call-seq:
+ *   samples_overlap = overlap -> overlap
+ */
+static VALUE
+ruby_whisper_vad_params_set_samples_overlap(VALUE self, VALUE value)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  rwvp->params.samples_overlap = RFLOAT_VALUE(value);
+  return value;
+}
+
+static VALUE
+ruby_whisper_vad_params_get_samples_overlap(VALUE self)
+{
+  ruby_whisper_vad_params *rwvp;
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+  return DBL2NUM(rwvp->params.samples_overlap);
+}
+
+static VALUE
+ruby_whisper_vad_params_equal(VALUE self, VALUE other)
+{
+  ruby_whisper_vad_params *rwvp1;
+  ruby_whisper_vad_params *rwvp2;
+
+  if (self == other) {
+    return Qtrue;
+  }
+
+  if (!rb_obj_is_kind_of(other, cVADParams)) {
+    return Qfalse;
+  }
+
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp1);
+  TypedData_Get_Struct(other, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp2);
+
+  if (rwvp1->params.threshold != rwvp2->params.threshold) {
+    return Qfalse;
+  }
+  if (rwvp1->params.min_speech_duration_ms != rwvp2->params.min_speech_duration_ms) {
+    return Qfalse;
+  }
+  if (rwvp1->params.min_silence_duration_ms != rwvp2->params.min_silence_duration_ms) {
+    return Qfalse;
+  }
+  if (rwvp1->params.max_speech_duration_s != rwvp2->params.max_speech_duration_s) {
+    return Qfalse;
+  }
+  if (rwvp1->params.speech_pad_ms != rwvp2->params.speech_pad_ms) {
+    return Qfalse;
+  }
+  if (rwvp1->params.samples_overlap != rwvp2->params.samples_overlap) {
+    return Qfalse;
+  }
+
+  return Qtrue;
+}
+
+#define SET_PARAM_IF_SAME(param_name) \
+  if (id == id_ ## param_name) { \
+    ruby_whisper_vad_params_set_ ## param_name(self, value); \
+    continue; \
+  }
+
+VALUE
+ruby_whisper_vad_params_initialize(int argc, VALUE *argv, VALUE self)
+{
+  VALUE kw_hash;
+  VALUE values[NUM_PARAMS] = {Qundef};
+  VALUE value;
+  ruby_whisper_vad_params *rwvp;
+  ID id;
+  int i;
+
+  TypedData_Get_Struct(self, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+
+  rb_scan_args_kw(RB_SCAN_ARGS_KEYWORDS, argc, argv, ":", &kw_hash);
+  if (NIL_P(kw_hash)) {
+    return self;
+  }
+
+  rb_get_kwargs(kw_hash, param_names, 0, NUM_PARAMS, values);
+
+  for (i = 0; i < NUM_PARAMS; i++) {
+    id = param_names[i];
+    value = values[i];
+    if (value == Qundef) {
+      continue;
+    }
+    SET_PARAM_IF_SAME(threshold)
+    SET_PARAM_IF_SAME(min_speech_duration_ms)
+    SET_PARAM_IF_SAME(min_silence_duration_ms)
+    SET_PARAM_IF_SAME(max_speech_duration_s)
+    SET_PARAM_IF_SAME(speech_pad_ms)
+    SET_PARAM_IF_SAME(samples_overlap)
+  }
+
+  return self;
+}
+
+#undef SET_PARAM_IF_SAME
+
+void
+init_ruby_whisper_vad_params(VALUE *mVAD)
+{
+  cVADParams = rb_define_class_under(*mVAD, "Params", rb_cObject);
+  rb_define_alloc_func(cVADParams, ruby_whisper_vad_params_s_allocate);
+  rb_define_method(cVADParams, "initialize", ruby_whisper_vad_params_initialize, -1);
+
+  DEFINE_PARAM(threshold, 0)
+  DEFINE_PARAM(min_speech_duration_ms, 1)
+  DEFINE_PARAM(min_silence_duration_ms, 2)
+  DEFINE_PARAM(max_speech_duration_s, 3)
+  DEFINE_PARAM(speech_pad_ms, 4)
+  DEFINE_PARAM(samples_overlap, 5)
+
+  rb_define_method(cVADParams, "==", ruby_whisper_vad_params_equal, 1);
+}
+
+#undef DEFINE_PARAM
+#undef NUM_PARAMS
--- a/bindings/ruby/extsources.rb
+++ b/bindings/ruby/extsources.rb
@ -1,5 +1,10 @@
+require "pathname"
+
+root = Pathname("..")/".."
 ignored_dirs = %w[
  .devops
+  .github
+  ci
  examples/wchess/wchess.wasm
  examples/whisper.android
  examples/whisper.android.java
@ -9,7 +14,7 @@ ignored_dirs = %w[
  models
  samples
  scripts
-]
+].collect {|dir| root/dir}
 ignored_files = %w[
  AUTHORS
  Makefile
@ -17,18 +22,19 @@ ignored_files = %w[
  README_sycl.md
  .gitignore
  .gitmodules
+  .dockerignore
  whisper.nvim
  twitch.sh
  yt-wsp.sh
+  close-issue.yml
 ]

 EXTSOURCES =
-  `git ls-files -z ../..`.split("\x0")
-    .select {|file|
-      basename = File.basename(file)
-
-      ignored_dirs.all? {|dir| !file.start_with?("../../#{dir}")} &&
-        !ignored_files.include?(basename) &&
-        (file.start_with?("../..") || file.start_with?("../javascript")) &&
-        (!file.start_with?("../../.github/") || basename == "bindings-ruby.yml")
+  `git ls-files -z #{root}`.split("\x0")
+    .collect {|file| Pathname(file)}
+    .reject {|file|
+      ignored_dirs.any? {|dir| file.descend.any? {|desc| desc == dir}} ||
+        ignored_files.include?(file.basename.to_path) ||
+        (file.descend.to_a[1] != root && file.descend.to_a[1] != Pathname("..")/"javascript")
    }
+    .collect(&:to_path)
--- a/bindings/ruby/lib/whisper/context.rb
+++ b/bindings/ruby/lib/whisper/context.rb
@ -0,0 +1,15 @@
+module Whisper
+  class Context
+    def to_srt
+      each_segment.with_index.reduce("") {|srt, (segment, index)|
+        srt << "#{index + 1}\n#{segment.to_srt_cue}\n"
+      }
+    end
+
+    def to_webvtt
+      each_segment.with_index.reduce("WEBVTT\n\n") {|webvtt, (segment, index)|
+        webvtt << "#{index + 1}\n#{segment.to_webvtt_cue}\n"
+      }
+    end
+  end
+end
--- a/bindings/ruby/lib/whisper/model/uri.rb
+++ b/bindings/ruby/lib/whisper/model/uri.rb
@ -55,6 +55,8 @@ module Whisper
            when Net::HTTPNotModified
              # noop
            when Net::HTTPOK
+              return if !response.key?("last-modified") && cache_path.exist?
+
              download response
            when Net::HTTPRedirection
              request URI(response["location"]), headers
@ -128,6 +130,44 @@ module Whisper
      end
    end

+    class ZipURI < URI
+      def cache
+        zip_path = super
+        dest = unzipped_path
+        return if dest.exist? && dest.mtime >= zip_path.mtime
+        escaping dest do
+          system "unzip", "-q", "-d", zip_path.dirname.to_path, zip_path.to_path, exception: true
+        end
+        zip_path
+      end
+
+      def clear_cache
+        super
+        unzipped_path.rmtree if unzipped_path.exist?
+      end
+
+      private
+
+      def unzipped_path
+        cache_path.sub_ext("")
+      end
+
+      def escaping(path)
+        escaped = Pathname("#{path}.removing")
+        if path.exist?
+          escaped.rmtree if escaped.exist?
+          path.rename escaped
+        end
+        yield
+      ensure
+        if path.exist?
+          escaped.rmtree if escaped.exist?
+        else
+          escaped.rename path if escaped.exist?
+        end
+      end
+    end
+
    @pre_converted_models = %w[
      tiny
      tiny.en
@ -163,8 +203,31 @@ module Whisper
      models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
    }

+    %w[
+      silero-v5.1.2
+    ].each do |name|
+      @pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin")
+    end
+
+    @coreml_compiled_models = %w[
+      tiny
+      tiny.en
+      base
+      base.en
+      small
+      small.en
+      medium
+      medium.en
+      large-v1
+      large-v2
+      large-v3
+      large-v3-turbo
+    ].each_with_object({}) do |name, models|
+      models[@pre_converted_models[name]] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
+    end
+
    class << self
-      attr_reader :pre_converted_models
+      attr_reader :pre_converted_models, :coreml_compiled_models
    end
  end
 end
--- a/bindings/ruby/lib/whisper/segment.rb
+++ b/bindings/ruby/lib/whisper/segment.rb
@ -0,0 +1,58 @@
+module Whisper
+  class Segment
+    SRT_ESCAPES = {
+      "&" => "&amp;",
+      "<" => "&lt;",
+      ">" => "&gt;",
+    }
+    SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
+    private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
+
+    def to_srt_cue
+      "#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
+    end
+
+    def to_webvtt_cue
+      "#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
+    end
+
+    private
+
+    def time_to_a(time)
+      sec, decimal_part = time.divmod(1000)
+      min, sec = sec.divmod(60)
+      hour, min = min.divmod(60)
+      [hour, min, sec, decimal_part]
+    end
+
+    def srt_time(time)
+      "%02d:%02d:%02d,%03d" % time_to_a(time)
+    end
+
+    def srt_start_time
+      srt_time(start_time)
+    end
+
+    def srt_end_time
+      srt_time(end_time)
+    end
+
+    def srt_text
+      text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
+    end
+
+    def webvtt_time(time)
+      "%02d:%02d:%02d.%03d" % time_to_a(time)
+    end
+
+    def webvtt_start_time
+      webvtt_time(start_time)
+    end
+
+    def webvtt_end_time
+      webvtt_time(end_time)
+    end
+
+    alias webvtt_text srt_text
+  end
+end
--- a/bindings/ruby/sig/whisper.rbs
+++ b/bindings/ruby/sig/whisper.rbs
@ -22,21 +22,22 @@ module Whisper
  def self.lang_str: (Integer id) -> String
  def self.lang_str_full: (Integer id) -> String
  def self.log_set: (log_callback, Object? user_data) -> log_callback
+  def self.system_info_str: () -> String

  class Context
-    def self.new: (path | ::URI::HTTP) -> instance
+    def self.new: (String | path | ::URI::HTTP) -> instance

    # transcribe a single file
    # can emit to a block results
    #
-    #   params = Whisper::Params.new
-    #   params.duration = 60_000
-    #   whisper.transcribe "path/to/audio.wav", params do |text|
-    #     puts text
-    #   end
+    #     params = Whisper::Params.new
+    #     params.duration = 60_000
+    #     whisper.transcribe "path/to/audio.wav", params do |text|
+    #       puts text
+    #     end
    #
-    def transcribe: (string, Params) -> self
-                  | (string, Params) { (String) -> void } -> self
+    def transcribe: (string, Params, ?n_processors: Integer) -> self
+                  | (string, Params, ?n_processors: Integer) { (String) -> void } -> self

    def model_n_vocab: () -> Integer
    def model_n_audio_ctx: () -> Integer
@ -49,16 +50,16 @@ module Whisper

    # Yields each Whisper::Segment:
    #
-    #   whisper.transcribe("path/to/audio.wav", params)
-    #   whisper.each_segment do |segment|
-    #     puts segment.text
-    #   end
+    #     whisper.transcribe("path/to/audio.wav", params)
+    #     whisper.each_segment do |segment|
+    #       puts segment.text
+    #     end
    #
    # Returns an Enumerator if no block given:
    #
-    #   whisper.transcribe("path/to/audio.wav", params)
-    #   enum = whisper.each_segment
-    #   enum.to_a # => [#<Whisper::Segment>, ...]
+    #     whisper.transcribe("path/to/audio.wav", params)
+    #     enum = whisper.each_segment
+    #     enum.to_a # => [#<Whisper::Segment>, ...]
    #
    def each_segment: { (Segment) -> void } -> void
                    | () -> Enumerator[Segment]
@ -73,25 +74,25 @@ module Whisper

    # Start time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
    #
-    #   full_get_segment_t0(3) # => 1668 (16680 ms)
+    #     full_get_segment_t0(3) # => 1668 (16680 ms)
    #
    def full_get_segment_t0: (Integer) -> Integer

    # End time of a segment indexed by +segment_index+ in centiseconds (10 times milliseconds).
    #
-    #   full_get_segment_t1(3) # => 1668 (16680 ms)
+    #     full_get_segment_t1(3) # => 1668 (16680 ms)
    #
    def full_get_segment_t1: (Integer) -> Integer

    # Whether the next segment indexed by +segment_index+ is predicated as a speaker turn.
    #
-    #   full_get_segment_speacker_turn_next(3) # => true
+    #     full_get_segment_speacker_turn_next(3) # => true
    #
    def full_get_segment_speaker_turn_next: (Integer) -> (true | false)

    # Text of a segment indexed by +segment_index+.
    #
-    #   full_get_segment_text(3) # => "ask not what your country can do for you, ..."
+    #     full_get_segment_text(3) # => "ask not what your country can do for you, ..."
    #
    def full_get_segment_text: (Integer) -> String

@ -115,6 +116,9 @@ module Whisper
    def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
                     | (Params, _Samples, ?Integer n_samples) -> self
                     | (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
+
+    def to_srt: () -> String
+    def to_webvtt: () -> String
  end

  class Params
@ -150,7 +154,10 @@ module Whisper
      ?encoder_begin_callback: encoder_begin_callback,
      ?encoder_begin_callback_user_data: Object,
      ?abort_callback: abort_callback,
-      ?abort_callback_user_data: Object
+      ?abort_callback_user_data: Object,
+      ?vad: boolish,
+      ?vad_model_path: path | URI,
+      ?vad_params: Whisper::VAD::Params
    ) -> instance

    # params.language = "auto" | "en", etc...
@ -278,9 +285,9 @@ module Whisper

    # Sets new segment callback, called for every newly generated text segment.
    #
-    #   params.new_segment_callback = ->(context, _, n_new, user_data) {
-    #     # ...
-    #   }
+    #     params.new_segment_callback = ->(context, _, n_new, user_data) {
+    #       # ...
+    #     }
    #
    def new_segment_callback=: (new_segment_callback) -> new_segment_callback
    def new_segment_callback: () -> (new_segment_callback | nil)
@ -293,9 +300,9 @@ module Whisper

    # Sets progress callback, called on each progress update.
    #
-    #   params.new_segment_callback = ->(context, _, progress, user_data) {
-    #     # ...
-    #   }
+    #     params.new_segment_callback = ->(context, _, progress, user_data) {
+    #       # ...
+    #     }
    #
    # +progress+ is an Integer between 0 and 100.
    #
@ -323,9 +330,9 @@ module Whisper

    # Sets abort callback, called to check if the process should be aborted.
    #
-    #   params.abort_callback = ->(user_data) {
-    #     # ...
-    #   }
+    #     params.abort_callback = ->(user_data) {
+    #       # ...
+    #     }
    #
    #
    def abort_callback=: (abort_callback) -> abort_callback
@ -338,11 +345,25 @@ module Whisper

    def abort_callback_user_data: () -> Object

+    # Enable VAD
+    #
+    def vad=: (boolish) -> boolish
+
+    def vad: () -> (true | false)
+
+    # Path to the VAD model
+    def vad_model_path=: (path | URI | nil) -> (path | URI | nil)
+
+    def vad_model_path: () -> (String | nil)
+
+    def vad_params=: (Whisper::VAD::Params) -> Whisper::VAD::Params
+    def vad_params: () -> (Whisper::VAD::Params)
+
    # Hook called on new segment. Yields each Whisper::Segment.
    #
-    #   whisper.on_new_segment do |segment|
-    #     # ...
-    #   end
+    #     whisper.on_new_segment do |segment|
+    #       # ...
+    #     end
    #
    def on_new_segment: { (Segment) -> void } -> void

@ -356,19 +377,20 @@ module Whisper

    # Call block to determine whether abort or not. Return +true+ when you want to abort.
    #
-    #   params.abort_on do
-    #     if some_condition
-    #       true # abort
-    #     else
-    #       false # continue
+    #     params.abort_on do
+    #       if some_condition
+    #         true # abort
+    #       else
+    #         false # continue
+    #       end
    #     end
-    #   end
    #
    def abort_on: { (Object user_data) -> boolish } -> void
  end

  class Model
    def self.pre_converted_models: () -> Hash[String, Model::URI]
+    def self.coreml_compiled_models: () -> Hash[Model::URI, Model::ZipURI]
    def self.new: () -> instance
    def n_vocab: () -> Integer
    def n_audio_ctx: () -> Integer
@ -388,9 +410,22 @@ module Whisper
      def to_path: -> String
      def clear_cache: -> void
    end
+
+    class ZipURI < URI
+      def cache: () -> Pathname
+      def clear_cache: () -> void
+    end
  end

  class Segment
+    type deconstructed_keys = {
+      start_time: (Integer | nil),
+      end_time: (Integer | nil),
+      text: (String | nil),
+      no_speech_prob: (Float | nil),
+      speaker_turn_next: (true | false | nil)
+    }
+
    # Start time in milliseconds.
    #
    def start_time: () -> Integer
@ -400,10 +435,70 @@ module Whisper
    def end_time: () -> Integer

    # Whether the next segment is predicted as a speaker turn.
-    def speaker_next_turn?: () -> (true | false)
+    def speaker_turn_next?: () -> (true | false)

    def text: () -> String
    def no_speech_prob: () -> Float
+    def to_srt_cue: () -> String
+    def to_webvtt_cue: () -> String
+
+    #  Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+    #
+    #      whisper.each_segment do |segment|
+    #        segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+    #
+    #        puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
+    #      end
+    def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
+  end
+
+  module VAD
+    class Params
+      def self.new: (
+        ?threshold: Float,
+        ?min_speech_duration_ms: Integer,
+        ?min_silence_duration_ms: Integer,
+        ?max_speech_duration_s: Float,
+        ?speech_pad_ms: Integer,
+        ?samples_overlap: Float
+      ) -> instance
+
+      # Probability threshold to consider as speech.
+      #
+      def threshold=: (Float) -> Float
+
+      def threshold: () -> Float
+
+      # Min duration for a valid speech segment.
+      #
+      def min_speech_duration_ms=: (Integer) -> Integer
+
+      def min_speech_duration_ms: () -> Integer
+
+      # Min silence duration to consider speech as ended.
+      #
+      def min_silence_duration_ms=: (Integer) -> Integer
+
+      def min_silence_duration_ms: () -> Integer
+
+      # Max duration of a speech segment before forcing a new segment.
+      def max_speech_duration_s=: (Float) -> Float
+
+      def max_speech_duration_s: () -> Float
+
+      # Padding added before and after speech segments.
+      #
+      def speech_pad_ms=: (Integer) -> Integer
+
+      def speech_pad_ms: () -> Integer
+
+      # Overlap in seconds when copying audio samples from speech segment.
+      #
+      def samples_overlap=: (Float) -> Float
+
+      def samples_overlap: () -> Float
+      def ==: (Params) -> (true | false)
+    end
  end

  class Error < StandardError
--- a/bindings/ruby/tests/helper.rb
+++ b/bindings/ruby/tests/helper.rb
@ -3,7 +3,7 @@ require "whisper"
 require_relative "jfk_reader/jfk_reader"

 class TestBase < Test::Unit::TestCase
-  AUDIO = File.join(__dir__, "..", "..", "..", "samples", "jfk.wav")
+  AUDIO = File.join(__dir__, "fixtures", "jfk.wav")

  class << self
    def whisper
@ -21,15 +21,4 @@ class TestBase < Test::Unit::TestCase
  def whisper
    self.class.whisper
  end
-
-  module BuildOptions
-    load "ext/options.rb", self
-    Options.include self
-
-    def enable_config(name)
-    end
-
-    def arg_config(name)
-    end
-  end
 end
--- a/bindings/ruby/tests/jfk_reader/.gitignore
+++ b/bindings/ruby/tests/jfk_reader/.gitignore
--- a/bindings/ruby/tests/jfk_reader/extconf.rb
+++ b/bindings/ruby/tests/jfk_reader/extconf.rb
--- a/bindings/ruby/tests/jfk_reader/jfk_reader.c
+++ b/bindings/ruby/tests/jfk_reader/jfk_reader.c
--- a/bindings/ruby/tests/test_callback.rb
+++ b/bindings/ruby/tests/test_callback.rb
--- a/bindings/ruby/tests/test_error.rb
+++ b/bindings/ruby/tests/test_error.rb
--- a/bindings/ruby/tests/test_model.rb
+++ b/bindings/ruby/tests/test_model.rb
@ -106,4 +106,13 @@ class TestModel < TestBase
    assert_equal 1, model.ftype
    assert_equal "base", model.type
  end
+
+  def test_coreml_model_auto_download
+    uri = Whisper::Model.coreml_compiled_models[Whisper::Model.pre_converted_models["tiny"]]
+    model_path = Pathname(uri.to_path).sub_ext("")
+    model_path.rmtree if model_path.exist?
+
+    uri.cache
+    assert_path_exist model_path
+  end
 end
--- a/bindings/ruby/tests/test_package.rb
+++ b/bindings/ruby/tests/test_package.rb
@ -18,12 +18,24 @@ class TestPackage < TestBase
    end

    def test_install
-      match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
-      filename = match_data[1]
-      version = match_data[2]
+      gemspec = Gem::Specification.load("whispercpp.gemspec")
      Dir.mktmpdir do |dir|
-        system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{filename.shellescape}", exception: true
-        assert_installed dir, version
+        system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{gemspec.file_name.shellescape}", exception: true
+        assert_installed dir, gemspec.version
+      end
+    end
+
+    def test_install_with_coreml
+      omit_unless RUBY_PLATFORM.match?(/darwin/) do
+        gemspec = Gem::Specification.load("whispercpp.gemspec")
+        Dir.mktmpdir do |dir|
+          system "gem", "install", "--install-dir", dir.shellescape, "--no-document", "pkg/#{gemspec.file_name.shellescape}", "--", "--enable-whisper-coreml", exception: true
+          assert_installed dir, gemspec.version
+          assert_nothing_raised do
+            libdir = File.join(dir, "gems", "#{gemspec.name}-#{gemspec.version}", "lib")
+            system "ruby", "-I", libdir, "-r", "whisper", "-e", "Whisper::Context.new('tiny')", exception: true
+          end
+        end
      end
    end

@ -35,12 +47,4 @@ class TestPackage < TestBase
      assert_path_not_exist File.join(dir, "gems/whispercpp-#{version}/ext/build")
    end
  end
-
-  def test_build_options
-    options = BuildOptions::Options.new
-    assert_empty options.missing_options
-    unless ENV["CI"]
-      assert_empty options.extra_options
-    end
-  end
 end
--- a/bindings/ruby/tests/test_params.rb
+++ b/bindings/ruby/tests/test_params.rb
@ -32,6 +32,9 @@ class TestParams < TestBase
    :progress_callback_user_data,
    :abort_callback,
    :abort_callback_user_data,
+    :vad,
+    :vad_model_path,
+    :vad_params,
  ]

  def setup
@ -191,6 +194,50 @@ class TestParams < TestBase
    assert_in_delta 0.2, @params.no_speech_thold
  end

+  def test_vad
+    assert_false @params.vad
+    @params.vad = true
+    assert_true @params.vad
+  end
+
+  def test_vad_model_path
+    assert_nil @params.vad_model_path
+    @params.vad_model_path = "silero-v5.1.2"
+    assert_equal Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path, @params.vad_model_path
+  end
+
+  def test_vad_model_path_with_nil
+    @params.vad_model_path = "silero-v5.1.2"
+    @params.vad_model_path = nil
+    assert_nil @params.vad_model_path
+  end
+
+  def test_vad_model_path_with_invalid
+    assert_raise TypeError do
+      @params.vad_model_path = Object.new
+    end
+  end
+
+  def test_vad_model_path_with_URI_string
+    @params.vad_model_path = "https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin"
+    assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
+  end
+
+  def test_vad_model_path_with_URI
+    @params.vad_model_path = URI("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-silero-v5.1.2.bin")
+    assert_equal @params.vad_model_path, Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
+  end
+
+  def test_vad_params
+    assert_kind_of Whisper::VAD::Params, @params.vad_params
+    default_params = @params.vad_params
+    assert_same default_params, @params.vad_params
+    assert_equal 0.5, default_params.threshold
+    new_params = Whisper::VAD::Params.new
+    @params.vad_params = new_params
+    assert_same new_params, @params.vad_params
+  end
+
  def test_new_with_kw_args
    params = Whisper::Params.new(language: "es")
    assert_equal "es", params.language
@ -225,6 +272,10 @@ class TestParams < TestBase
              proc {}
            in [/_user_data\Z/, *]
              Object.new
+            in [:vad_model_path, *]
+              Whisper::Model.pre_converted_models["silero-v5.1.2"].to_path
+            in [:vad_params, *]
+              Whisper::VAD::Params.new
            end
    params = Whisper::Params.new(param => value)
    if Float === value
--- a/bindings/ruby/test/test_segment.rb
+++ b/bindings/ruby/test/test_segment.rb
@ -0,0 +1,136 @@
+require_relative "helper"
+
+class TestSegment < TestBase
+  def test_iteration
+    whisper.each_segment do |segment|
+      assert_instance_of Whisper::Segment, segment
+    end
+  end
+
+  def test_enumerator
+    enum = whisper.each_segment
+    assert_instance_of Enumerator, enum
+    enum.to_a.each_with_index do |segment, index|
+      assert_instance_of Whisper::Segment, segment
+      assert_kind_of Integer, index
+    end
+  end
+
+  def test_start_time
+    i = 0
+    whisper.each_segment do |segment|
+      assert_equal 0, segment.start_time if i == 0
+      i += 1
+    end
+  end
+
+  def test_end_time
+    i = 0
+    whisper.each_segment do |segment|
+      assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
+      i += 1
+    end
+  end
+
+  def test_no_speech_prob
+    no_speech_prob = nil
+    whisper.each_segment do |segment|
+      no_speech_prob = segment.no_speech_prob
+    end
+    assert no_speech_prob > 0.0
+  end
+
+  def test_on_new_segment
+    params = Whisper::Params.new
+    seg = nil
+    index = 0
+    params.on_new_segment do |segment|
+      assert_instance_of Whisper::Segment, segment
+      if index == 0
+        seg = segment
+        assert_equal 0, segment.start_time
+        assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
+      end
+      index += 1
+    end
+    whisper.transcribe(AUDIO, params)
+    assert_equal 0, seg.start_time
+    assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
+  end
+
+  def test_on_new_segment_twice
+    params = Whisper::Params.new
+    seg = nil
+    params.on_new_segment do |segment|
+      seg = segment
+      return
+    end
+    params.on_new_segment do |segment|
+      assert_same seg, segment
+      return
+    end
+    whisper.transcribe(AUDIO, params)
+  end
+
+  def test_pattern_matching
+    segment = whisper.each_segment.first
+    segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+
+    assert_equal segment.start_time, start_time
+    assert_equal segment.end_time, end_time
+    assert_equal segment.text, text
+    assert_equal segment.no_speech_prob, no_speech_prob
+    assert_equal segment.speaker_turn_next?, speaker_turn_next
+  end
+
+  def test_pattern_matching_partial
+    segment = whisper.each_segment.first
+    segment => {start_time:, end_time:, text:}
+
+    assert_equal segment.start_time, start_time
+    assert_equal segment.end_time, end_time
+    assert_equal segment.text, text
+  end
+
+  def test_deconstruct_keys
+    segment = whisper.each_segment.first
+    expected = {
+      start_time: segment.start_time,
+      end_time: segment.end_time,
+      text: segment.text,
+      no_speech_prob: segment.no_speech_prob,
+      speaker_turn_next: segment.speaker_turn_next?
+    }
+    assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
+  end
+
+  def test_deconstruct_keys_non_existent
+    omit "Undefined behavior"
+
+    segment = whisper.each_segment.first
+
+    assert_equal({}, segment.deconstruct_keys([:non_existent]))
+  end
+
+  def test_deconstruct_keys_too_many_keys
+    omit "Undefined behavior"
+
+    segment = whisper.each_segment.first
+
+    assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
+  end
+
+  def test_deconstruct_keys_includes_non_existent_keys_not_too_many
+    omit "Undefined behavior"
+
+    segment = whisper.each_segment.first
+
+    expected = {
+      start_time: segment.start_time,
+      end_time: segment.end_time,
+      text: segment.text,
+      no_speech_prob: segment.no_speech_prob
+    }
+    assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
+  end
+end
--- a/bindings/ruby/test/test_vad.rb
+++ b/bindings/ruby/test/test_vad.rb
@ -0,0 +1,19 @@
+require_relative "helper"
+
+class TestVAD < TestBase
+  def setup
+    @whisper = Whisper::Context.new("base.en")
+    vad_params = Whisper::VAD::Params.new
+    @params = Whisper::Params.new(
+      vad: true,
+      vad_model_path: "silero-v5.1.2",
+      vad_params:
+    )
+  end
+
+  def test_transcribe
+    @whisper.transcribe(TestBase::AUDIO, @params) do |text|
+      assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
+    end
+  end
+end
--- a/bindings/ruby/test/test_vad_params.rb
+++ b/bindings/ruby/test/test_vad_params.rb
@ -0,0 +1,103 @@
+require_relative "helper"
+
+class TestVADParams < TestBase
+  PARAM_NAMES = [
+    :threshold,
+    :min_speech_duration_ms,
+    :min_silence_duration_ms,
+    :max_speech_duration_s,
+    :speech_pad_ms,
+    :samples_overlap
+  ]
+
+  def setup
+    @params = Whisper::VAD::Params.new
+  end
+
+  def test_new
+    params = Whisper::VAD::Params.new
+    assert_kind_of Whisper::VAD::Params, params
+  end
+
+  def test_threshold
+    assert_in_delta @params.threshold, 0.5
+    @params.threshold = 0.7
+    assert_in_delta @params.threshold, 0.7
+  end
+
+  def test_min_speech_duration
+    pend
+  end
+
+  def test_min_speech_duration_ms
+    assert_equal 250, @params.min_speech_duration_ms
+    @params.min_speech_duration_ms = 500
+    assert_equal 500, @params.min_speech_duration_ms
+  end
+
+  def test_min_silence_duration_ms
+    assert_equal 100, @params.min_silence_duration_ms
+    @params.min_silence_duration_ms = 200
+    assert_equal 200, @params.min_silence_duration_ms
+  end
+
+  def test_max_speech_duration
+    pend
+  end
+
+  def test_max_speech_duration_s
+    assert @params.max_speech_duration_s >= 10e37 # Defaults to FLT_MAX
+    @params.max_speech_duration_s = 60.0
+    assert_equal 60.0, @params.max_speech_duration_s
+  end
+
+  def test_speech_pad_ms
+    assert_equal 30, @params.speech_pad_ms
+    @params.speech_pad_ms = 50
+    assert_equal 50, @params.speech_pad_ms
+  end
+
+  def test_samples_overlap
+    assert_in_delta @params.samples_overlap, 0.1
+    @params.samples_overlap = 0.5
+    assert_in_delta @params.samples_overlap, 0.5
+  end
+
+  def test_equal
+    assert_equal @params, Whisper::VAD::Params.new
+  end
+
+  def test_new_with_kw_args
+    params = Whisper::VAD::Params.new(threshold: 0.7)
+    assert_in_delta params.threshold, 0.7
+    assert_equal 250, params.min_speech_duration_ms
+  end
+
+  def test_new_with_kw_args_non_existent
+    assert_raise ArgumentError do
+      Whisper::VAD::Params.new(non_existent: "value")
+    end
+  end
+
+  data(PARAM_NAMES.collect {|param| [param, param]}.to_h)
+  def test_new_with_kw_args_default_values(param)
+    default_value = @params.send(param)
+    value = default_value + 1
+    params = Whisper::VAD::Params.new(param => value)
+    if Float === value
+      assert_in_delta value, params.send(param)
+    else
+      assert_equal value, params.send(param)
+    end
+
+    PARAM_NAMES.reject {|name| name == param}.each do |name|
+      expected = @params.send(name)
+      actual = params.send(name)
+      if Float === expected
+        assert_in_delta expected, actual
+      else
+        assert_equal expected, actual
+      end
+    end
+  end
+end
--- a/bindings/ruby/tests/test_whisper.rb
+++ b/bindings/ruby/tests/test_whisper.rb
@ -20,6 +20,24 @@ class TestWhisper < TestBase
    }
  end

+  def test_transcribe_non_parallel
+    @whisper = Whisper::Context.new("base.en")
+    params  = Whisper::Params.new
+
+    @whisper.transcribe(AUDIO, params, n_processors: 1) {|text|
+      assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
+    }
+  end
+
+  def test_transcribe_n_processors
+    @whisper = Whisper::Context.new("base.en")
+    params  = Whisper::Params.new
+
+    @whisper.transcribe(AUDIO, params, n_processors: 4) {|text|
+      assert_match(/ask not what your country can do for you[,.] ask what you can do for your country/i, text)
+    }
+  end
+
  sub_test_case "After transcription" do
    def test_full_n_segments
      assert_equal 1, whisper.full_n_segments
@ -94,6 +112,10 @@ class TestWhisper < TestBase
    end
  end

+  def test_system_info_str
+    assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str)
+  end
+
  def test_log_set
    user_data = Object.new
    logs = []
@ -223,4 +245,48 @@ class TestWhisper < TestBase
      assert_match(/for your country/i, text)
    end
  end
+
+  def test_to_srt
+    whisper = Whisper::Context.new("base.en")
+    whisper.transcribe AUDIO, @params
+
+    lines = whisper.to_srt.lines
+    assert_match(/\A\d+\n/, lines[0])
+    assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
+    assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
+  end
+
+  def test_to_webvtt
+    whisper = Whisper::Context.new("base.en")
+    whisper.transcribe AUDIO, @params
+
+    lines = whisper.to_webvtt.lines
+    assert_equal "WEBVTT\n", lines[0]
+    assert_equal "\n", lines[1]
+    assert_match(/\A\d+\n/, lines[2])
+    assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
+    assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
+  end
+
+  sub_test_case "Format needs escape" do
+    def setup
+      @whisper = Whisper::Context.new("base.en")
+      @whisper.transcribe AUDIO, Whisper::Params.new
+      segment = @whisper.each_segment.first
+      segment.define_singleton_method :text do
+        "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
+      end
+      @whisper.define_singleton_method :each_segment do
+        Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
+      end
+    end
+
+    def test_to_srt_escape
+      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
+    end
+
+    def test_to_webvtt_escape
+      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
+    end
+  end
 end
--- a/bindings/ruby/tests/test_segment.rb
+++ b/bindings/ruby/tests/test_segment.rb
@ -1,74 +0,0 @@
-require_relative "helper"
-
-class TestSegment < TestBase
-  def test_iteration
-    whisper.each_segment do |segment|
-      assert_instance_of Whisper::Segment, segment
-    end
-  end
-
-  def test_enumerator
-    enum = whisper.each_segment
-    assert_instance_of Enumerator, enum
-    enum.to_a.each_with_index do |segment, index|
-      assert_instance_of Whisper::Segment, segment
-      assert_kind_of Integer, index
-    end
-  end
-
-  def test_start_time
-    i = 0
-    whisper.each_segment do |segment|
-      assert_equal 0, segment.start_time if i == 0
-      i += 1
-    end
-  end
-
-  def test_end_time
-    i = 0
-    whisper.each_segment do |segment|
-      assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
-      i += 1
-    end
-  end
-
-  def test_no_speech_prob
-    no_speech_prob = nil
-    whisper.each_segment do |segment|
-      no_speech_prob = segment.no_speech_prob
-    end
-    assert no_speech_prob > 0.0
-  end
-
-  def test_on_new_segment
-    params = Whisper::Params.new
-    seg = nil
-    index = 0
-    params.on_new_segment do |segment|
-      assert_instance_of Whisper::Segment, segment
-      if index == 0
-        seg = segment
-        assert_equal 0, segment.start_time
-        assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
-      end
-      index += 1
-    end
-    whisper.transcribe(AUDIO, params)
-    assert_equal 0, seg.start_time
-    assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
-  end
-
-  def test_on_new_segment_twice
-    params = Whisper::Params.new
-    seg = nil
-    params.on_new_segment do |segment|
-      seg = segment
-      return
-    end
-    params.on_new_segment do |segment|
-      assert_same seg, segment
-      return
-    end
-    whisper.transcribe(AUDIO, params)
-  end
-end
--- a/bindings/ruby/whispercpp.gemspec
+++ b/bindings/ruby/whispercpp.gemspec
@ -3,8 +3,7 @@ require_relative "extsources"
 Gem::Specification.new do |s|
  s.name    = "whispercpp"
  s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
-  s.version = '1.3.2'
-  s.date    = '2025-04-25'
+  s.version = '1.3.3'
  s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
  s.email   = 'todd.fisher@gmail.com'
  s.extra_rdoc_files = ['LICENSE', 'README.md']
@ -21,7 +20,7 @@ Gem::Specification.new do |s|
              }

  s.summary = %q{Ruby whisper.cpp bindings}
-  s.test_files = s.files.select {|file| file.start_with? "tests/"}
+  s.test_files = s.files.select {|file| file.start_with? "test/"}

  s.extensions << 'ext/extconf.rb'
  s.required_ruby_version = '>= 3.1.0'
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -105,6 +105,7 @@ else()
    add_subdirectory(bench)
    add_subdirectory(server)
    add_subdirectory(quantize)
+    add_subdirectory(vad-speech-segments)
    if (WHISPER_SDL2)
        add_subdirectory(stream)
        add_subdirectory(command)
--- a/examples/addon.node/test/whisper.spec.js
+++ b/examples/addon.node/test/whisper.spec.js
@ -17,6 +17,7 @@ const whisperParamsMock = {
  comma_in_time: false,
  translate: true,
  no_timestamps: false,
+  detect_language: false,
  audio_ctx: 0,
  max_len: 0,
  prompt: "",
@ -30,8 +31,9 @@ const whisperParamsMock = {
 describe("Run whisper.node", () => {
    test("it should receive a non-empty value", async () => {
        let result = await whisperAsync(whisperParamsMock);
+      console.log(result);

-        expect(result.length).toBeGreaterThan(0);
+        expect(result['transcription'].length).toBeGreaterThan(0);
    }, 10000);
 });

--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -38,6 +38,7 @@ struct whisper_params {
    bool print_progress = false;
    bool no_timestamps  = false;
    bool no_prints      = false;
+    bool detect_language= false;
    bool use_gpu        = true;
    bool flash_attn     = false;
    bool comma_in_time  = true;
@ -82,7 +83,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
            t1 = whisper_full_get_segment_t1(ctx, i);
        }

-        if (!params.no_timestamps) {
+        if (!params.no_timestamps && !params.no_prints) {
            printf("[%s --> %s]  ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
        }

@ -113,12 +114,14 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper

        // colorful print bug
        //
-        const char * text = whisper_full_get_segment_text(ctx, i);
-        printf("%s%s", speaker.c_str(), text);
+        if (!params.no_prints) {
+            const char * text = whisper_full_get_segment_text(ctx, i);
+            printf("%s%s", speaker.c_str(), text);
+        }


        // with timestamps or speakers: each segment on new line
-        if (!params.no_timestamps || params.diarize) {
+        if ((!params.no_timestamps || params.diarize) && !params.no_prints) {
            printf("\n");
        }

@ -128,6 +131,11 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper

 void cb_log_disable(enum ggml_log_level, const char *, void *) {}

+struct whisper_result {
+    std::vector<std::vector<std::string>> segments;
+    std::string language;
+};
+
 class ProgressWorker : public Napi::AsyncWorker {
 public:
    ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
@ -158,15 +166,27 @@ class ProgressWorker : public Napi::AsyncWorker {

    void OnOK() override {
        Napi::HandleScope scope(Env());
-        Napi::Object res = Napi::Array::New(Env(), result.size());
-        for (uint64_t i = 0; i < result.size(); ++i) {
+
+        if (params.detect_language) {
+            Napi::Object resultObj = Napi::Object::New(Env());
+            resultObj.Set("language", Napi::String::New(Env(), result.language));
+            Callback().Call({Env().Null(), resultObj});
+        }
+
+        Napi::Object returnObj = Napi::Object::New(Env());
+        if (!result.language.empty()) {
+            returnObj.Set("language", Napi::String::New(Env(), result.language));
+        }
+        Napi::Array transcriptionArray = Napi::Array::New(Env(), result.segments.size());
+        for (uint64_t i = 0; i < result.segments.size(); ++i) {
            Napi::Object tmp = Napi::Array::New(Env(), 3);
            for (uint64_t j = 0; j < 3; ++j) {
-                tmp[j] = Napi::String::New(Env(), result[i][j]);
+                tmp[j] = Napi::String::New(Env(), result.segments[i][j]);
            }
-            res[i] = tmp;
-        }
-        Callback().Call({Env().Null(), res});
+            transcriptionArray[i] = tmp;
+         }
+         returnObj.Set("transcription", transcriptionArray);
+         Callback().Call({Env().Null(), returnObj});
    }

    // Progress callback function - using thread-safe function
@ -183,12 +203,12 @@ class ProgressWorker : public Napi::AsyncWorker {

 private:
    whisper_params params;
-    std::vector<std::vector<std::string>> result;
+    whisper_result result;
    Napi::Env env;
    Napi::ThreadSafeFunction tsfn;

    // Custom run function with progress callback support
-    int run_with_progress(whisper_params &params, std::vector<std::vector<std::string>> &result) {
+    int run_with_progress(whisper_params &params, whisper_result & result) {
        if (params.no_prints) {
            whisper_log_set(cb_log_disable, NULL);
        }
@ -277,7 +297,8 @@ class ProgressWorker : public Napi::AsyncWorker {
                wparams.print_timestamps = !params.no_timestamps;
                wparams.print_special    = params.print_special;
                wparams.translate        = params.translate;
-                wparams.language         = params.language.c_str();
+                wparams.language         = params.detect_language ? "auto" : params.language.c_str();
+                wparams.detect_language  = params.detect_language;
                wparams.n_threads        = params.n_threads;
                wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
                wparams.offset_ms        = params.offset_t_ms;
@ -328,18 +349,22 @@ class ProgressWorker : public Napi::AsyncWorker {
                    return 10;
                }
            }
-    }
+        }

+        if (params.detect_language || params.language == "auto") {
+            result.language = whisper_lang_str(whisper_full_lang_id(ctx));
+        }
        const int n_segments = whisper_full_n_segments(ctx);
-        result.resize(n_segments);
+        result.segments.resize(n_segments);
+
        for (int i = 0; i < n_segments; ++i) {
            const char * text = whisper_full_get_segment_text(ctx, i);
            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

-            result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
-            result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
-            result[i].emplace_back(text);
+            result.segments[i].emplace_back(to_timestamp(t0, params.comma_in_time));
+            result.segments[i].emplace_back(to_timestamp(t1, params.comma_in_time));
+            result.segments[i].emplace_back(text);
        }

        whisper_print_timings(ctx);
@ -364,6 +389,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
  bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
  bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
  bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
+  bool detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
  int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
  bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
  int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
@ -416,6 +442,7 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
  params.max_context = max_context;
  params.print_progress = print_progress;
  params.prompt = prompt;
+  params.detect_language = detect_language;

  Napi::Function callback = info[1].As<Napi::Function>();
  // Create a new Worker class with progress callback support
--- a/examples/addon.node/index.js
+++ b/examples/addon.node/index.js
@ -17,6 +17,7 @@ const whisperParams = {
  comma_in_time: false,
  translate: true,
  no_timestamps: false,
+  detect_language: false,
  audio_ctx: 0,
  max_len: 0,
  progress_callback: (progress) => {
@ -31,6 +32,8 @@ const params = Object.fromEntries(
      const [key, value] = item.slice(2).split("=");
      if (key === "audio_ctx") {
        whisperParams[key] = parseInt(value);
+      } else if (key === "detect_language") {
+        whisperParams[key] = value === "true";
      } else {
        whisperParams[key] = value;
      }
--- a/examples/bench.wasm/CMakeLists.txt
+++ b/examples/bench.wasm/CMakeLists.txt
@ -35,7 +35,7 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    -s INITIAL_MEMORY=2000MB \
    -s TOTAL_MEMORY=2000MB \
    -s FORCE_FILESYSTEM=1 \
-    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
+    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap', 'HEAPU8']\" \
    ${EXTRA_FLAGS} \
    ")

--- a/examples/bench.wasm/README.md
+++ b/examples/bench.wasm/README.md
@ -28,5 +28,10 @@ to the server's HTTP path:
 ```
 # copy the produced page to your HTTP path
 cp bin/bench.wasm/*       /path/to/html/
+cp bin/libbench.js        /path/to/html/
 cp bin/libbench.worker.js /path/to/html/
 ```
+
+> 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no
+> longer generated and the worker is embedded in the main JS file. So the worker
+> file will not be geneated for versions later than `3.1.58`.
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@ -66,13 +66,12 @@ static int whisper_bench_full(const whisper_params & params) {
    cparams.use_gpu    = params.use_gpu;
    cparams.flash_attn = params.flash_attn;

-    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
-
    {
        fprintf(stderr, "\n");
        fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
    }

+    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
    if (ctx == nullptr) {
        fprintf(stderr, "error: failed to initialize whisper context\n");
        return 2;
@ -156,6 +155,8 @@ static int whisper_bench_full(const whisper_params & params) {
 }

 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
--- a/examples/cli/README.md
+++ b/examples/cli/README.md
@ -6,7 +6,8 @@ It can be used as a reference for using the `whisper.cpp` library in other proje
 ```
 ./build/bin/whisper-cli -h

-usage: ./build-pkg/bin/whisper-cli [options] file0.wav file1.wav ...
+usage: ./build/bin/whisper-cli [options] file0 file1 ...
+supported audio formats: flac, mp3, ogg, wav

 options:
  -h,        --help              [default] show this help message and exit
@ -24,6 +25,7 @@ options:
  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
+  -nth N,    --no-speech-thold N [0.60   ] no speech threshold
  -tp,       --temperature N     [0.00   ] The sampling temperature, between 0 and 1
  -tpi,      --temperature-inc N [0.20   ] The increment of temperature, between 0 and 1
  -debug,    --debug-mode        [false  ] enable debug mode (eg. dump log_mel)
@ -50,12 +52,13 @@ options:
  -dl,       --detect-language   [false  ] exit after automatically detecting language
             --prompt PROMPT     [       ] initial prompt (max n_text_ctx/2 tokens)
  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
-  -f FNAME,  --file FNAME        [       ] input WAV file path
+  -f FNAME,  --file FNAME        [       ] input audio file path
  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
  -dtw MODEL --dtw MODEL         [       ] compute token-level timestamps
  -ls,       --log-score         [false  ] log best decoder scores of tokens
  -ng,       --no-gpu            [false  ] disable GPU
  -fa,       --flash-attn        [false  ] flash attention
+  -sns,      --suppress-nst      [false  ] suppress non-speech tokens
  --suppress-regex REGEX         [       ] regular expression matching tokens to suppress
  --grammar GRAMMAR              [       ] GBNF grammar to guide decoding
  --grammar-rule RULE            [       ] top-level GBNF grammar rule name
--- a/examples/cli/cli.cpp
+++ b/examples/cli/cli.cpp
@ -11,6 +11,7 @@
 #include <thread>
 #include <vector>
 #include <cstring>
+#include <cfloat>

 #if defined(_WIN32)
 #ifndef NOMINMAX
@ -19,10 +20,6 @@
 #include <windows.h>
 #endif

-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
 // helper function to replace substrings
 static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
    for (size_t pos = 0; ; pos += replace.length()) {
@ -73,6 +70,7 @@ struct whisper_params {
    bool no_prints       = false;
    bool print_special   = false;
    bool print_colors    = false;
+    bool print_confidence= false;
    bool print_progress  = false;
    bool no_timestamps   = false;
    bool log_score       = false;
@ -101,6 +99,16 @@ struct whisper_params {
    std::vector<std::string> fname_out = {};

    grammar_parser::parse_state grammar_parsed;
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad           = false;
+    std::string vad_model     = "";
+    float       vad_threshold = 0.5f;
+    int         vad_min_speech_duration_ms = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s = FLT_MAX;
+    int         vad_speech_pad_ms = 30;
+    float       vad_samples_overlap = 0.1f;
 };

 static void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -172,6 +180,7 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
        else if (arg == "-np"   || arg == "--no-prints")       { params.no_prints       = true; }
        else if (arg == "-ps"   || arg == "--print-special")   { params.print_special   = true; }
        else if (arg == "-pc"   || arg == "--print-colors")    { params.print_colors    = true; }
+        else if (                  arg == "--print-confidence"){ params.print_confidence= true; }
        else if (arg == "-pp"   || arg == "--print-progress")  { params.print_progress  = true; }
        else if (arg == "-nt"   || arg == "--no-timestamps")   { params.no_timestamps   = true; }
        else if (arg == "-l"    || arg == "--language")        { params.language        = whisper_param_turn_lowercase(ARGV_NEXT); }
@ -189,6 +198,15 @@ static bool whisper_params_parse(int argc, char ** argv, whisper_params & params
        else if (                  arg == "--grammar")         { params.grammar         = ARGV_NEXT; }
        else if (                  arg == "--grammar-rule")    { params.grammar_rule    = ARGV_NEXT; }
        else if (                  arg == "--grammar-penalty") { params.grammar_penalty = std::stof(ARGV_NEXT); }
+        // Voice Activity Detection (VAD)
+        else if (                  arg == "--vad")                         { params.vad                         = true; }
+        else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model                   = ARGV_NEXT; }
+        else if (arg == "-vt"   || arg == "--vad-threshold")               { params.vad_threshold               = std::stof(ARGV_NEXT); }
+        else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")  { params.vad_min_speech_duration_ms  = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms  = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")   { params.vad_max_speech_duration_s   = std::stof(ARGV_NEXT); }
+        else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")           { params.vad_speech_pad_ms           = std::stoi(ARGV_NEXT); }
+        else if (arg == "-vo"   || arg == "--vad-samples-overlap")         { params.vad_samples_overlap         = std::stof(ARGV_NEXT); }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
@ -241,6 +259,7 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
    fprintf(stderr, "  -np,       --no-prints         [%-7s] do not print anything other than the results\n",   params.no_prints ? "true" : "false");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
+    fprintf(stderr, "             --print-confidence  [%-7s] print confidence\n",                               params.print_confidence ? "true" : "false");
    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
@ -258,6 +277,18 @@ static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params
    fprintf(stderr, "  --grammar GRAMMAR              [%-7s] GBNF grammar to guide decoding\n",                 params.grammar.c_str());
    fprintf(stderr, "  --grammar-rule RULE            [%-7s] top-level GBNF grammar rule name\n",               params.grammar_rule.c_str());
    fprintf(stderr, "  --grammar-penalty N            [%-7.1f] scales down logits of nongrammar tokens\n",      params.grammar_penalty);
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "             --vad                           [%-7s] enable Voice Activity Detection (VAD)\n",            params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n",                                   params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech recognition\n",           params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms  N [%-7d] VAD min speech duration (0.0-1.0)\n",                params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n",      params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s   N [%-7s] VAD max speech duration (auto-split longer)\n",      params.vad_max_speech_duration_s == FLT_MAX ?
+                                                                                                                                  std::string("FLT_MAX").c_str() :
+                                                                                                                                  std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms           N [%-7d] VAD speech padding (extend segments)\n",             params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad-samples-overlap         N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
    fprintf(stderr, "\n");
 }

@ -358,6 +389,26 @@ static void whisper_print_segment_callback(struct whisper_context * ctx, struct

                printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
            }
+        } else if (params.print_confidence) {
+            for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
+                if (params.print_special == false) {
+                    const whisper_token id = whisper_full_get_token_id(ctx, i, j);
+                    if (id >= whisper_token_eot(ctx)) {
+                        continue;
+                    }
+                }
+
+                const char * text = whisper_full_get_token_text(ctx, i, j);
+                const float  p    = whisper_full_get_token_p   (ctx, i, j);
+
+                int style_idx = 2;     // High confidence - dim
+                if (p < 0.33) {
+                    style_idx = 0;     // Low confidence - inverse (highlighted)
+                } else if (p < 0.66) {
+                    style_idx = 1;     // Medium confidence - underlined
+                }
+                printf("%s%s%s%s", speaker.c_str(), k_styles[style_idx].c_str(), text, "\033[0m");
+            }
        } else {
            const char * text = whisper_full_get_segment_text(ctx, i);

@ -379,15 +430,7 @@ static void whisper_print_segment_callback(struct whisper_context * ctx, struct
    }
 }

-static bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
+static void output_txt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
@ -402,19 +445,9 @@ static bool output_txt(struct whisper_context * ctx, const char * fname, const w

        fout << speaker << text << "\n";
    }
-
-    return true;
 }

-static bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
+static void output_vtt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    fout << "WEBVTT\n\n";

    const int n_segments = whisper_full_n_segments(ctx);
@ -434,19 +467,9 @@ static bool output_vtt(struct whisper_context * ctx, const char * fname, const w
        fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
        fout << speaker << text << "\n\n";
    }
-
-    return true;
 }

-static bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
+static void output_srt(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
@ -463,8 +486,6 @@ static bool output_srt(struct whisper_context * ctx, const char * fname, const w
        fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
        fout << speaker << text << "\n\n";
    }
-
-    return true;
 }

 static char * escape_double_quotes_and_backslashes(const char * str) {
@ -530,15 +551,7 @@ static char * escape_double_quotes_in_csv(const char * str) {
    return escaped;
 }

-static bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
+static void output_csv(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    const int n_segments = whisper_full_n_segments(ctx);
    fout << "start,end,";
    if (params.diarize && pcmf32s.size() == 2)
@ -561,14 +574,9 @@ static bool output_csv(struct whisper_context * ctx, const char * fname, const w
        }
        fout << "\"" << text_escaped << "\"\n";
    }
-
-    return true;
 }

-static bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
-    std::ofstream fout(fname);
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
+static void output_score(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
    const int n_segments = whisper_full_n_segments(ctx);
    // fprintf(stderr,"segments: %d\n",n_segments);
    for (int i = 0; i < n_segments; ++i) {
@ -581,16 +589,14 @@ static bool output_score(struct whisper_context * ctx, const char * fname, const
            // fprintf(stderr,"token: %s %f\n",token,probability);
 	    }
    }
-    return true;
 }

-static bool output_json(
+static void output_json(
             struct whisper_context * ctx,
-                         const char * fname,
+                      std::ofstream & fout,
               const whisper_params & params,
-    std::vector<std::vector<float>>   pcmf32s,
-                               bool   full) {
-    std::ofstream fout(fname);
+    std::vector<std::vector<float>>   pcmf32s) {
+    const bool full = params.output_jsn_full;
    int indent = 0;

    auto doindent = [&]() {
@ -670,12 +676,6 @@ static bool output_json(
        end_obj(end);
    };

-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
    start_obj(nullptr);
        value_s("systeminfo", whisper_print_system_info(), false);
        start_obj("model");
@ -749,17 +749,12 @@ static bool output_json(

        end_arr(true);
    end_obj(true);
-    return true;
 }

 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-static bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
+static bool output_wts(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s, const char * fname_inp, float t_sec, const char * fname_out) {
    static const char * font = params.font_path.c_str();

    std::ifstream fin(font);
@ -875,20 +870,12 @@ static bool output_wts(struct whisper_context * ctx, const char * fname, const c

    fout.close();

-    fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
+    fprintf(stderr, "# %s: run 'source %s' to generate karaoke video\n", __func__, fname_out);

    return true;
 }

-static bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
-    std::ofstream fout(fname);
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-
+static void output_lrc(struct whisper_context * ctx, std::ofstream & fout, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
    fout << "[by:whisper.cpp]\n";

    const int n_segments = whisper_full_n_segments(ctx);
@ -916,14 +903,14 @@ static bool output_lrc(struct whisper_context * ctx, const char * fname, const w

        fout <<  '[' << timestamp_lrc << ']' << speaker << text << "\n";
    }
-
-    return true;
 }


 static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }

 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
 #if defined(_WIN32)
    // Set the console output code page to UTF-8, while command line arguments
    // are still encoded in the system's code page. In this way, we can print
@ -1003,7 +990,6 @@ int main(int argc, char ** argv) {
    }

    // whisper init
-
    struct whisper_context_params cparams = whisper_context_default_params();

    cparams.use_gpu    = params.use_gpu;
@ -1066,8 +1052,55 @@ int main(int argc, char ** argv) {
    }

    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
-        const auto fname_inp = params.fname_inp[f];
-		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
+        const auto & fname_inp = params.fname_inp[f];
+        struct fout_factory {
+            std::string fname_out;
+            const size_t basename_length;
+            const bool is_stdout;
+            bool used_stdout;
+            decltype(whisper_print_segment_callback) * const print_segment_callback;
+            std::ofstream fout;
+
+            fout_factory (const std::string & fname_out_, const std::string & fname_inp, whisper_params & params) :
+                    fname_out{!fname_out_.empty() ? fname_out_ : fname_inp},
+                    basename_length{fname_out.size()},
+                    is_stdout{fname_out == "-"},
+                    used_stdout{},
+                    print_segment_callback{is_stdout ? nullptr : whisper_print_segment_callback} {
+                if (!print_segment_callback) {
+                    params.print_progress = false;
+                }
+            }
+
+            bool open(const char * ext, const char * function) {
+                if (is_stdout) {
+                    if (used_stdout) {
+                        fprintf(stderr, "warning: Not appending multiple file formats to stdout\n");
+                        return false;
+                    }
+
+                    used_stdout = true;
+#ifdef _WIN32
+                    fout = std::ofstream{"CON"};
+#else
+                    fout = std::ofstream{"/dev/stdout"};
+#endif
+                    // Not using fprintf stderr here because it might equal stdout
+                    // Also assuming /dev is mounted
+                    return true;
+                }
+
+                fname_out.resize(basename_length);
+                fname_out += ext;
+                fout = std::ofstream{fname_out};
+                if (!fout.is_open()) {
+                    fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
+                    return false;
+                }
+                fprintf(stderr, "%s: saving output to '%s'\n", function, fname_out.c_str());
+                return true;
+            }
+        } fout_factory{f < (int) params.fname_out.size() ? params.fname_out[f] : "", fname_inp, params};

        std::vector<float> pcmf32;               // mono-channel F32 PCM
        std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
@ -1104,6 +1137,11 @@ int main(int argc, char ** argv) {
                    params.tinydiarize ? "tdrz = 1, " : "",
                    params.no_timestamps ? 0 : 1);

+            if (params.print_colors) {
+                fprintf(stderr, "%s: color scheme: red (low confidence), yellow (medium), green (high confidence)\n", __func__);
+            } else if (params.print_confidence) {
+                fprintf(stderr, "%s: confidence: highlighted (low confidence), underlined (medium), dim (high confidence)\n", __func__);
+            }
            fprintf(stderr, "\n");
        }

@ -1154,6 +1192,16 @@ int main(int argc, char ** argv) {

            wparams.suppress_nst     = params.suppress_nst;

+            wparams.vad            = params.vad;
+            wparams.vad_model_path = params.vad_model.c_str();
+
+            wparams.vad_params.threshold               = params.vad_threshold;
+            wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+            wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+            wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+            wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+            wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
+
            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

            const auto & grammar_parsed = params.grammar_parsed;
@ -1172,7 +1220,7 @@ int main(int argc, char ** argv) {

            // this callback is called on each new segment
            if (!wparams.print_realtime) {
-                wparams.new_segment_callback           = whisper_print_segment_callback;
+                wparams.new_segment_callback           = fout_factory.print_segment_callback;
                wparams.new_segment_callback_user_data = &user_data;
            }

@ -1214,54 +1262,26 @@ int main(int argc, char ** argv) {

        // output stuff
        {
-            printf("\n");
+            // macros to stringify function name
+#define output_func(func, ext, param, ...) if (param && fout_factory.open(ext, #func)) {\
+    func(ctx, fout_factory.fout, params, __VA_ARGS__); \
+}
+#define output_ext(ext, ...) output_func(output_##ext, "." #ext, params.output_##ext, __VA_ARGS__)

-            // output to text file
-            if (params.output_txt) {
-                const auto fname_txt = fname_out + ".txt";
-                output_txt(ctx, fname_txt.c_str(), params, pcmf32s);
-            }
+            output_ext(txt, pcmf32s);
+            output_ext(vtt, pcmf32s);
+            output_ext(srt, pcmf32s);
+            output_ext(wts, pcmf32s, fname_inp.c_str(), float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, fout_factory.fname_out.c_str());
+            output_ext(csv, pcmf32s);
+            output_func(output_json, ".json", params.output_jsn, pcmf32s);
+            output_ext(lrc, pcmf32s);
+            output_func(output_score, ".score.txt", params.log_score, pcmf32s);

-            // output to VTT file
-            if (params.output_vtt) {
-                const auto fname_vtt = fname_out + ".vtt";
-                output_vtt(ctx, fname_vtt.c_str(), params, pcmf32s);
-            }
+#undef output_ext
+#undef output_func

-            // output to SRT file
-            if (params.output_srt) {
-                const auto fname_srt = fname_out + ".srt";
-                output_srt(ctx, fname_srt.c_str(), params, pcmf32s);
-            }
-
-            // output to WTS file
-            if (params.output_wts) {
-                const auto fname_wts = fname_out + ".wts";
-                output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE, pcmf32s);
-            }
-
-            // output to CSV file
-            if (params.output_csv) {
-                const auto fname_csv = fname_out + ".csv";
-                output_csv(ctx, fname_csv.c_str(), params, pcmf32s);
-            }
-
-            // output to JSON file
-            if (params.output_jsn) {
-                const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params, pcmf32s, params.output_jsn_full);
-            }
-
-            // output to LRC file
-            if (params.output_lrc) {
-                const auto fname_lrc = fname_out + ".lrc";
-                output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
-            }
-
-            // output to score file
-            if (params.log_score) {
-                const auto fname_score = fname_out + ".score.txt";
-                output_score(ctx, fname_score.c_str(), params, pcmf32s);
+            if (fout_factory.is_stdout && !fout_factory.used_stdout) {
+                fprintf(stderr, "warning: '--output-file -' used without any other '--output-*'");
            }
        }
    }
--- a/examples/command.wasm/CMakeLists.txt
+++ b/examples/command.wasm/CMakeLists.txt
@ -36,7 +36,7 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    -s INITIAL_MEMORY=1024MB \
    -s TOTAL_MEMORY=1024MB \
    -s FORCE_FILESYSTEM=1 \
-    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
+    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap', 'HEAPU8']\" \
    ${EXTRA_FLAGS} \
    ")

--- a/examples/command.wasm/README.md
+++ b/examples/command.wasm/README.md
@ -28,5 +28,10 @@ To run the example in a different server, you need to copy the following files
 to the server's HTTP path:
 ```
 cp bin/command.wasm/*       /path/to/html/
+cp bin/libcommand.js        /path/to/html/
 cp bin/libcommand.worker.js /path/to/html/
 ```
+
+> 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no
+> longer generated and the worker is embedded in the main JS file. So the worker
+> file will not be geneated for versions later than `3.1.58`.
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -251,7 +251,7 @@ static std::vector<std::string> get_words(const std::string &txt) {

 // command-list mode
 // guide the transcription to match the most likely command from a provided list
-static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
+static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::ofstream &fout) {
    fprintf(stderr, "\n");
    fprintf(stderr, "%s: guided mode\n", __func__);

@ -444,12 +444,16 @@ static int process_command_list(struct whisper_context * ctx, audio_async &audio

                    const float prob = probs_id[0].first;
                    const int index = probs_id[0].second;
+                    const char * best_command = allowed_commands[index].c_str();

                    fprintf(stdout, "\n");
                    fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
-                            "\033[1m", allowed_commands[index].c_str(), "\033[0m", prob,
+                            "\033[1m", best_command, "\033[0m", prob,
                            (int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
                    fprintf(stdout, "\n");
+                    if (fout.is_open()) {
+                        fout << best_command << std::endl;
+                    }
                }
            }

@ -462,7 +466,7 @@ static int process_command_list(struct whisper_context * ctx, audio_async &audio

 // always-prompt mode
 // transcribe the voice into text after valid prompt
-static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
+static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params, std::ofstream & fout) {
    bool is_running = true;
    bool ask_prompt = true;

@ -528,6 +532,9 @@ static int always_prompt_transcription(struct whisper_context * ctx, audio_async

                if ((sim > 0.7f) && (command.size() > 0)) {
                    fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
+                    if (fout.is_open()) {
+                        fout << command << std::endl;
+                    }
                }

                fprintf(stdout, "\n");
@ -542,7 +549,7 @@ static int always_prompt_transcription(struct whisper_context * ctx, audio_async

 // general-purpose mode
 // freely transcribe the voice into text
-static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
+static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params, std::ofstream & fout) {
    bool is_running  = true;
    bool have_prompt = false;
    bool ask_prompt  = true;
@ -662,8 +669,10 @@ static int process_general_transcription(struct whisper_context * ctx, audio_asy
                    } else {
                        // cut the prompt from the decoded text
                        const std::string command = ::trim(txt.substr(best_len));
-
                        fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
+                        if (fout.is_open()) {
+                            fout << command << std::endl;
+                        }
                    }

                    fprintf(stdout, "\n");
@ -678,6 +687,8 @@ static int process_general_transcription(struct whisper_context * ctx, audio_asy
 }

 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
@ -698,6 +709,10 @@ int main(int argc, char ** argv) {
    cparams.flash_attn = params.flash_attn;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 2;
+    }

    // print some info about the processing
    {
@ -757,13 +772,22 @@ int main(int argc, char ** argv) {
        }
    }

+    std::ofstream fout;
+    if (params.fname_out.length() > 0) {
+        fout.open(params.fname_out);
+        if (!fout.is_open()) {
+            fprintf(stderr, "%s: failed to open output file '%s'!\n", __func__, params.fname_out.c_str());
+            return 1;
+        }
+    }
+
    if (ret_val == 0) {
        if (!params.commands.empty()) {
-            ret_val = process_command_list(ctx, audio, params);
+            ret_val = process_command_list(ctx, audio, params, fout);
        } else if (!params.prompt.empty() && params.grammar_parsed.rules.empty()) {
-            ret_val = always_prompt_transcription(ctx, audio, params);
+            ret_val = always_prompt_transcription(ctx, audio, params, fout);
        } else {
-            ret_val = process_general_transcription(ctx, audio, params);
+            ret_val = process_general_transcription(ctx, audio, params, fout);
        }
    }

--- a/examples/common-whisper.cpp
+++ b/examples/common-whisper.cpp
@ -26,10 +26,6 @@
 #define MINIAUDIO_IMPLEMENTATION
 #include "miniaudio.h"

-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
 #ifdef _WIN32
 #include <fcntl.h>
 #include <io.h>
@ -116,13 +112,20 @@ bool read_audio_data(const std::string & fname, std::vector<float>& pcmf32, std:
    }

    if (stereo) {
-		pcmf32s.resize(2);
-		pcmf32s[0].resize(frame_count);
-		pcmf32s[1].resize(frame_count);
-		for (uint64_t i = 0; i < frame_count; i++) {
-			pcmf32s[0][i] = pcmf32[2*i];
-			pcmf32s[1][i] = pcmf32[2*i + 1];
-		}
+        std::vector<float> stereo_data = pcmf32;
+        pcmf32.resize(frame_count);
+
+        for (uint64_t i = 0; i < frame_count; i++) {
+            pcmf32[i] = (stereo_data[2*i] + stereo_data[2*i + 1]);
+        }
+
+        pcmf32s.resize(2);
+        pcmf32s[0].resize(frame_count);
+        pcmf32s[1].resize(frame_count);
+        for (uint64_t i = 0; i < frame_count; i++) {
+            pcmf32s[0][i] = stereo_data[2*i];
+            pcmf32s[1][i] = stereo_data[2*i + 1];
+        }
    }

    ma_decoder_uninit(&decoder);
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -10,10 +10,6 @@
 #include <regex>
 #include <sstream>

-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
 // Function to check if the next argument exists
 static std::string get_next_arg(int& i, int argc, char** argv, const std::string& flag, gpt_params& params) {
    if (i + 1 < argc && argv[i + 1][0] != '-') {
--- a/examples/common.h
+++ b/examples/common.h
@ -283,7 +283,7 @@ static std::string set_xterm256_foreground(int r, int g, int b) {
 }

 // Lowest is red, middle is yellow, highest is green. Color scheme from
-// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
+// Paul Tol; it is colorblind friendly https://sronpersonalpages.nl/~pault
 const std::vector<std::string> k_colors = {
    set_xterm256_foreground(220,   5,  12),
    set_xterm256_foreground(232,  96,  28),
@ -294,6 +294,26 @@ const std::vector<std::string> k_colors = {
    set_xterm256_foreground( 78, 178, 101),
 };

+// ANSI formatting codes
+static std::string set_inverse() {
+    return "\033[7m";
+}
+
+static std::string set_underline() {
+    return "\033[4m";
+}
+
+static std::string set_dim() {
+    return "\033[2m";
+}
+
+// Style scheme for different confidence levels
+const std::vector<std::string> k_styles = {
+    set_inverse(),   // Low confidence - inverse (highlighted)
+    set_underline(), // Medium confidence - underlined
+    set_dim(),       // High confidence - dim
+};
+
 //
 // Other utils
 //
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@ -424,6 +424,8 @@ static void process_loop(struct whisper_context * ctx, audio_async &audio, const
 }

 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
    whisper_params params;
    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -1,4 +1,5 @@
 #include "ggml.h"
+#include "ggml-backend.h"

 #include "common.h"
 #include "common-ggml.h"
@ -176,6 +177,8 @@ static bool whisper_model_quantize(const std::string & fname_inp, const std::str
 }

 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr);
--- a/examples/server/CMakeLists.txt
+++ b/examples/server/CMakeLists.txt
@ -1,3 +1,6 @@
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
 set(TARGET whisper-server)
 add_executable(${TARGET} server.cpp httplib.h)

--- a/examples/server/README.md
+++ b/examples/server/README.md
@ -23,6 +23,7 @@ options:
  -sow,      --split-on-word     [false  ] split on word rather than on token
  -bo N,     --best-of N         [2      ] number of best candidates to keep
  -bs N,     --beam-size N       [-1     ] beam size for beam search
+  -ac N,     --audio-ctx N       [0      ] audio context size (0 - all)
  -wt N,     --word-thold N      [0.01   ] word timestamp probability threshold
  -et N,     --entropy-thold N   [2.40   ] entropy threshold for decoder fail
  -lpt N,    --logprob-thold N   [-1.00  ] log probability threshold for decoder fail
@ -41,9 +42,28 @@ options:
             --prompt PROMPT     [       ] initial prompt
  -m FNAME,  --model FNAME       [models/ggml-base.en.bin] model path
  -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
+  -dtw MODEL --dtw MODEL         [       ] compute token-level timestamps
  --host HOST,                   [127.0.0.1] Hostname/ip-adress for the server
  --port PORT,                   [8080   ] Port number for the server
+  --public PATH,                 [examples/server/public] Path to the public folder
+  --request-path PATH,           [       ] Request path for all requests
+  --inference-path PATH,         [/inference] Inference path for all requests
  --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
+  -sns,      --suppress-nst      [false  ] suppress non-speech tokens
+  -nth N,    --no-speech-thold N [0.60   ] no speech threshold
+  -nc,       --no-context        [false  ] do not use previous audio context
+  -ng,       --no-gpu            [false  ] do not use gpu
+  -fa,       --flash-attn        [false  ] flash attention
+
+Voice Activity Detection (VAD) options:
+             --vad                           [false  ] enable Voice Activity Detection (VAD)
+  -vm FNAME, --vad-model FNAME               [       ] VAD model path
+  -vt N,     --vad-threshold N               [0.50   ] VAD threshold for speech recognition
+  -vspd N,   --vad-min-speech-duration-ms  N [250    ] VAD min speech duration (0.0-1.0)
+  -vsd N,    --vad-min-silence-duration-ms N [100    ] VAD min silence duration (to split segments)
+  -vmsd N,   --vad-max-speech-duration-s   N [FLT_MAX] VAD max speech duration (auto-split longer)
+  -vp N,     --vad-speech-pad-ms           N [30     ] VAD speech padding (extend segments)
+  -vo N,     --vad-samples-overlap         N [0.10   ] VAD samples overlap (seconds between segments)
 ```

 > [!WARNING]
@ -67,3 +87,35 @@ curl 127.0.0.1:8080/load \
 -H "Content-Type: multipart/form-data" \
 -F model="<path-to-model-file>"
 ```
+
+## Load testing with k6
+
+> **Note:** Install [k6](https://k6.io/docs/get-started/installation/) before running the benchmark script.
+
+You can benchmark the Whisper server using the provided bench.js script with [k6](https://k6.io/). This script sends concurrent multipart requests to the /inference endpoint and is fully configurable via environment variables.
+
+**Example usage:**
+
+```
+k6 run bench.js \
+  --env FILE_PATH=/absolute/path/to/samples/jfk.wav \
+  --env BASE_URL=http://127.0.0.1:8080 \
+  --env ENDPOINT=/inference \
+  --env CONCURRENCY=4 \
+  --env TEMPERATURE=0.0 \
+  --env TEMPERATURE_INC=0.2 \
+  --env RESPONSE_FORMAT=json
+```
+
+**Environment variables:**
+- `FILE_PATH`: Path to the audio file to send (must be absolute or relative to the k6 working directory)
+- `BASE_URL`: Server base URL (default: `http://127.0.0.1:8080`)
+- `ENDPOINT`: API endpoint (default: `/inference`)
+- `CONCURRENCY`: Number of concurrent requests (default: 4)
+- `TEMPERATURE`: Decoding temperature (default: 0.0)
+- `TEMPERATURE_INC`: Temperature increment (default: 0.2)
+- `RESPONSE_FORMAT`: Response format (default: `json`)
+
+**Note:**
+- The server must be running and accessible at the specified `BASE_URL` and `ENDPOINT`.
+- The script is located in the same directory as this README: `bench.js`.
--- a/examples/server/bench.js
+++ b/examples/server/bench.js
@ -0,0 +1,29 @@
+import http from 'k6/http'
+import { check } from 'k6'
+
+export let options = {
+  vus: parseInt(__ENV.CONCURRENCY) || 4,
+  iterations: parseInt(__ENV.CONCURRENCY) || 4,
+}
+
+const filePath        = __ENV.FILE_PATH
+const baseURL         = __ENV.BASE_URL        || 'http://127.0.0.1:8080'
+const endpoint        = __ENV.ENDPOINT        || '/inference'
+const temperature     = __ENV.TEMPERATURE     || '0.0'
+const temperatureInc  = __ENV.TEMPERATURE_INC || '0.2'
+const responseFormat  = __ENV.RESPONSE_FORMAT || 'json'
+
+// Read the file ONCE at init time
+const fileBin = open(filePath, 'b')
+
+export default function () {
+  const payload = {
+    file:           http.file(fileBin, filePath),
+    temperature:    temperature,
+    temperature_inc: temperatureInc,
+    response_format: responseFormat,
+  }
+
+  const res = http.post(`${baseURL}${endpoint}`, payload)
+  check(res, { 'status is 200': r => r.status === 200 })
+} 
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -5,6 +5,7 @@
 #include "httplib.h"
 #include "json.hpp"

+#include <cfloat>
 #include <chrono>
 #include <cmath>
 #include <cstdio>
@ -13,14 +14,23 @@
 #include <string>
 #include <thread>
 #include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
+#include <memory>
+#include <csignal>
+#include <atomic>
+#include <functional>
+#include <cstdlib>
+#if defined (_WIN32)
+#include <windows.h>
 #endif

 using namespace httplib;
 using json = nlohmann::ordered_json;

+enum server_state {
+    SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
+    SERVER_STATE_READY,          // Server is ready and model is loaded
+};
+
 namespace {

 // output formats
@ -30,6 +40,20 @@ const std::string srt_format    = "srt";
 const std::string vjson_format  = "verbose_json";
 const std::string vtt_format    = "vtt";

+std::function<void(int)> shutdown_handler;
+std::atomic_flag is_terminating = ATOMIC_FLAG_INIT;
+
+inline void signal_handler(int signal) {
+    if (is_terminating.test_and_set()) {
+        // in case it hangs, we can force terminate the server by hitting Ctrl+C twice
+        // this is for better developer experience, we can remove when the server is stable enough
+        fprintf(stderr, "Received second interrupt, terminating immediately.\n");
+        exit(1);
+    }
+
+    shutdown_handler(signal);
+}
+
 struct server_params
 {
    std::string hostname = "127.0.0.1";
@ -94,6 +118,16 @@ struct whisper_params {
    std::string openvino_encode_device = "CPU";

    std::string dtw = "";
+
+    // Voice Activity Detection (VAD) parameters
+    bool        vad           = false;
+    std::string vad_model     = "";
+    float       vad_threshold = 0.5f;
+    int         vad_min_speech_duration_ms = 250;
+    int         vad_min_silence_duration_ms = 100;
+    float       vad_max_speech_duration_s = FLT_MAX;
+    int         vad_speech_pad_ms = 30;
+    float       vad_samples_overlap = 0.1f;
 };

 void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
@ -142,6 +176,20 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -sns,      --suppress-nst      [%-7s] suppress non-speech tokens\n", params.suppress_nst ? "true" : "false");
    fprintf(stderr, "  -nth N,    --no-speech-thold N [%-7.2f] no speech threshold\n",   params.no_speech_thold);
    fprintf(stderr, "  -nc,       --no-context        [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
+    fprintf(stderr, "  -ng,       --no-gpu            [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
+    fprintf(stderr, "  -fa,       --flash-attn        [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
+    // Voice Activity Detection (VAD) parameters
+    fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+    fprintf(stderr, "             --vad                           [%-7s] enable Voice Activity Detection (VAD)\n",            params.vad ? "true" : "false");
+    fprintf(stderr, "  -vm FNAME, --vad-model FNAME               [%-7s] VAD model path\n",                                   params.vad_model.c_str());
+    fprintf(stderr, "  -vt N,     --vad-threshold N               [%-7.2f] VAD threshold for speech recognition\n",           params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms  N [%-7d] VAD min speech duration (0.0-1.0)\n",                params.vad_min_speech_duration_ms);
+    fprintf(stderr, "  -vsd N,    --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n",      params.vad_min_silence_duration_ms);
+    fprintf(stderr, "  -vmsd N,   --vad-max-speech-duration-s   N [%-7s] VAD max speech duration (auto-split longer)\n",      params.vad_max_speech_duration_s == FLT_MAX ?
+                                                                                                                                  std::string("FLT_MAX").c_str() :
+                                                                                                                                  std::to_string(params.vad_max_speech_duration_s).c_str());
+    fprintf(stderr, "  -vp N,     --vad-speech-pad-ms           N [%-7d] VAD speech padding (extend segments)\n",             params.vad_speech_pad_ms);
+    fprintf(stderr, "  -vo N,     --vad-samples-overlap         N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
    fprintf(stderr, "\n");
 }

@ -197,6 +245,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (                  arg == "--request-path")    { sparams.request_path = argv[++i]; }
        else if (                  arg == "--inference-path")  { sparams.inference_path = argv[++i]; }
        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
+
+        // Voice Activity Detection (VAD)
+        else if (                  arg == "--vad")                         { params.vad                         = true; }
+        else if (arg == "-vm"   || arg == "--vad-model")                   { params.vad_model                   = argv[++i]; }
+        else if (arg == "-vt"   || arg == "--vad-threshold")               { params.vad_threshold               = std::stof(argv[++i]); }
+        else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms")  { params.vad_min_speech_duration_ms  = std::stoi(argv[++i]); }
+        else if (arg == "-vsd"  || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms  = std::stoi(argv[++i]); }
+        else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s")   { params.vad_max_speech_duration_s   = std::stof(argv[++i]); }
+        else if (arg == "-vp"   || arg == "--vad-speech-pad-ms")           { params.vad_speech_pad_ms           = std::stoi(argv[++i]); }
+        else if (arg == "-vo"   || arg == "--vad-samples-overlap")         { params.vad_samples_overlap         = std::stof(argv[++i]); }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params, sparams);
@ -513,11 +571,41 @@ void get_req_parameters(const Request & req, whisper_params & params)
    {
        params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
    }
+    if (req.has_file("vad"))
+    {
+        params.vad = parse_str_to_bool(req.get_file_value("vad").content);
+    }
+    if (req.has_file("vad_threshold"))
+    {
+        params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
+    }
+    if (req.has_file("vad_min_speech_duration_ms"))
+    {
+        params.vad_min_speech_duration_ms = std::stof(req.get_file_value("vad_min_speech_duration_ms").content);
+    }
+    if (req.has_file("vad_min_silence_duration_ms"))
+    {
+        params.vad_min_silence_duration_ms = std::stof(req.get_file_value("vad_min_silence_duration_ms").content);
+    }
+    if (req.has_file("vad_max_speech_duration_s"))
+    {
+        params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
+    }
+    if (req.has_file("vad_speech_pad_ms"))
+    {
+        params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
+    }
+    if (req.has_file("vad_samples_overlap"))
+    {
+        params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
+    }
 }

 }  // namespace

 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
    whisper_params params;
    server_params sparams;

@ -593,6 +681,9 @@ int main(int argc, char ** argv) {
        }
    }

+    std::unique_ptr<httplib::Server> svr = std::make_unique<httplib::Server>();
+    std::atomic<server_state> state{SERVER_STATE_LOADING_MODEL};
+
    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);

    if (ctx == nullptr) {
@ -602,9 +693,10 @@ int main(int argc, char ** argv) {

    // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
    whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);
+    state.store(SERVER_STATE_READY);

-    Server svr;
-    svr.set_default_headers({{"Server", "whisper.cpp"},
+
+    svr->set_default_headers({{"Server", "whisper.cpp"},
                             {"Access-Control-Allow-Origin", "*"},
                             {"Access-Control-Allow-Headers", "content-type, authorization"}});

@ -683,15 +775,15 @@ int main(int argc, char ** argv) {
    whisper_params default_params = params;

    // this is only called if no index.html is found in the public --path
-    svr.Get(sparams.request_path + "/", [&default_content](const Request &, Response &res){
+    svr->Get(sparams.request_path + "/", [&](const Request &, Response &res){
        res.set_content(default_content, "text/html");
        return false;
    });

-    svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
+    svr->Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
    });

-    svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
+    svr->Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
        // acquire whisper model mutex lock
        std::lock_guard<std::mutex> lock(whisper_mutex);

@ -829,6 +921,16 @@ int main(int argc, char ** argv) {

            wparams.suppress_nst     = params.suppress_nst;

+            wparams.vad              = params.vad;
+            wparams.vad_model_path   = params.vad_model.c_str();
+
+            wparams.vad_params.threshold               = params.vad_threshold;
+            wparams.vad_params.min_speech_duration_ms  = params.vad_min_speech_duration_ms;
+            wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+            wparams.vad_params.max_speech_duration_s   = params.vad_max_speech_duration_s;
+            wparams.vad_params.speech_pad_ms           = params.vad_speech_pad_ms;
+            wparams.vad_params.samples_overlap         = params.vad_samples_overlap;
+
            whisper_print_user_data user_data = { &params, &pcmf32s, 0 };

            // this callback is called on each new segment
@ -842,33 +944,25 @@ int main(int argc, char ** argv) {
                wparams.progress_callback_user_data = &user_data;
            }

-            // examples for abort mechanism
-            // in examples below, we do not abort the processing, but we could if the flag is set to true
-
-            // the callback is called before every encoder run - if it returns false, the processing is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return !is_aborted;
-                };
-                wparams.encoder_begin_callback_user_data = &is_aborted;
-            }
-
-            // the callback is called before every computation - if it returns true, the computation is aborted
-            {
-                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-
-                wparams.abort_callback = [](void * user_data) {
-                    bool is_aborted = *(bool*)user_data;
-                    return is_aborted;
-                };
-                wparams.abort_callback_user_data = &is_aborted;
-            }
+            // tell whisper to abort if the HTTP connection closed
+            wparams.abort_callback = [](void *user_data) {
+                // user_data is a pointer to our Request
+                auto req_ptr = static_cast<const httplib::Request*>(user_data);
+                return req_ptr->is_connection_closed();
+            };
+            wparams.abort_callback_user_data = (void*)&req;

            if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
+                // handle failure or early abort
+                if (req.is_connection_closed()) {
+                    // log client disconnect
+                    fprintf(stderr, "client disconnected, aborted processing\n");
+                    res.status = 499; // Client Closed Request (nginx convention)
+                    res.set_content("{\"error\":\"client disconnected\"}", "application/json");
+                    return;
+                }
                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                res.status = 500; // Internal Server Error
                const std::string error_resp = "{\"error\":\"failed to process audio\"}";
                res.set_content(error_resp, "application/json");
                return;
@ -1005,8 +1099,9 @@ int main(int argc, char ** argv) {
        // reset params to their defaults
        params = default_params;
    });
-    svr.Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
+    svr->Post(sparams.request_path + "/load", [&](const Request &req, Response &res){
        std::lock_guard<std::mutex> lock(whisper_mutex);
+        state.store(SERVER_STATE_LOADING_MODEL);
        if (!req.has_file("model"))
        {
            fprintf(stderr, "error: no 'model' field in the request\n");
@ -1038,18 +1133,25 @@ int main(int argc, char ** argv) {
        // initialize openvino encoder. this has no effect on whisper.cpp builds that don't have OpenVINO configured
        whisper_ctx_init_openvino_encoder(ctx, nullptr, params.openvino_encode_device.c_str(), nullptr);

+        state.store(SERVER_STATE_READY);
        const std::string success = "Load was successful!";
        res.set_content(success, "application/text");

        // check if the model is in the file system
    });

-    svr.Get(sparams.request_path + "/health", [&](const Request &, Response &res){
-        const std::string health_response = "{\"status\":\"ok\"}";
-        res.set_content(health_response, "application/json");
+    svr->Get(sparams.request_path + "/health", [&](const Request &, Response &res){
+        server_state current_state = state.load();
+        if (current_state == SERVER_STATE_READY) {
+            const std::string health_response = "{\"status\":\"ok\"}";
+            res.set_content(health_response, "application/json");
+        } else {
+            res.set_content("{\"status\":\"loading model\"}", "application/json");
+            res.status = 503;
+        }
    });

-    svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
+    svr->set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
        const char fmt[] = "500 Internal Server Error\n%s";
        char buf[BUFSIZ];
        try {
@ -1063,7 +1165,7 @@ int main(int argc, char ** argv) {
        res.status = 500;
    });

-    svr.set_error_handler([](const Request &req, Response &res) {
+    svr->set_error_handler([](const Request &req, Response &res) {
        if (res.status == 400) {
            res.set_content("Invalid request", "text/plain");
        } else if (res.status != 500) {
@ -1073,10 +1175,10 @@ int main(int argc, char ** argv) {
    });

    // set timeouts and change hostname and port
-    svr.set_read_timeout(sparams.read_timeout);
-    svr.set_write_timeout(sparams.write_timeout);
+    svr->set_read_timeout(sparams.read_timeout);
+    svr->set_write_timeout(sparams.write_timeout);

-    if (!svr.bind_to_port(sparams.hostname, sparams.port))
+    if (!svr->bind_to_port(sparams.hostname, sparams.port))
    {
        fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n",
                sparams.hostname.c_str(), sparams.port);
@ -1084,18 +1186,50 @@ int main(int argc, char ** argv) {
    }

    // Set the base directory for serving static files
-    svr.set_base_dir(sparams.public_path);
+    svr->set_base_dir(sparams.public_path);

    // to make it ctrl+clickable:
    printf("\nwhisper server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port);

-    if (!svr.listen_after_bind())
-    {
-        return 1;
-    }
+    shutdown_handler = [&](int signal) {
+        printf("\nCaught signal %d, shutting down gracefully...\n", signal);
+        if (svr) {
+            svr->stop();
+        }
+    };

-    whisper_print_timings(ctx);
-    whisper_free(ctx);
+#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
+    struct sigaction sigint_action;
+    sigint_action.sa_handler = signal_handler;
+    sigemptyset (&sigint_action.sa_mask);
+    sigint_action.sa_flags = 0;
+    sigaction(SIGINT, &sigint_action, NULL);
+    sigaction(SIGTERM, &sigint_action, NULL);
+#elif defined (_WIN32)
+    auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL {
+        return (ctrl_type == CTRL_C_EVENT) ? (signal_handler(SIGINT), true) : false;
+    };
+    SetConsoleCtrlHandler(reinterpret_cast<PHANDLER_ROUTINE>(console_ctrl_handler), true);
+#endif
+
+    // clean up function, to be called before exit
+    auto clean_up = [&]() {
+        whisper_print_timings(ctx);
+        whisper_free(ctx);
+    };
+
+    std::thread t([&] {
+        if (!svr->listen_after_bind()) {
+            fprintf(stderr, "error: server listen failed\n");
+        }
+    });
+
+    svr->wait_until_ready();
+
+    t.join();
+
+
+    clean_up();

    return 0;
 }
--- a/examples/stream.wasm/CMakeLists.txt
+++ b/examples/stream.wasm/CMakeLists.txt
@ -35,7 +35,7 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    -s INITIAL_MEMORY=1024MB \
    -s TOTAL_MEMORY=1024MB \
    -s FORCE_FILESYSTEM=1 \
-    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
+    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap', 'HEAPU8']\" \
    ${EXTRA_FLAGS} \
    ")

--- a/examples/stream.wasm/README.md
+++ b/examples/stream.wasm/README.md
@ -26,5 +26,10 @@ to the server's HTTP path:
 ```
 # copy the produced page to your HTTP path
 cp bin/stream.wasm/*       /path/to/html/
+cp bin/libstream.js        /path/to/html/
 cp bin/libstream.worker.js /path/to/html/
 ```
+
+> 📝 **Note:** As of Emscripten 3.1.58 (April 2024), separate worker.js files are no
+> longer generated and the worker is embedded in the main JS file. So the worker
+> file will not be geneated for versions later than `3.1.58`.
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -116,6 +116,8 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
 }

 int main(int argc, char ** argv) {
+    ggml_backend_load_all();
+
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
@ -161,6 +163,10 @@ int main(int argc, char ** argv) {
    cparams.flash_attn = params.flash_attn;

    struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
+    if (ctx == nullptr) {
+        fprintf(stderr, "error: failed to initialize whisper context\n");
+        return 2;
+    }

    std::vector<float> pcmf32    (n_samples_30s, 0.0f);
    std::vector<float> pcmf32_old;
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -16,10 +16,14 @@ if (WHISPER_SDL2)
        llama-hparams.cpp
        llama-impl.cpp
        llama-io.cpp
-        llama-kv-cache.cpp
+        llama-kv-cache-unified.cpp
+        llama-kv-cache-unified-iswa.cpp
+        llama-memory-recurrent.cpp
+        llama-memory-hybrid.cpp
        llama-memory.cpp
        llama-mmap.cpp
        llama-model-loader.cpp
+        llama-model-saver.cpp
        llama-model.cpp
        llama-quant.cpp
        llama-sampling.cpp
--- a/examples/talk-llama/llama-adapter.cpp
+++ b/examples/talk-llama/llama-adapter.cpp
@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
    std::vector<ggml_backend_buffer_type_t> buft_extra;
    {
        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (!cpu_dev) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
                LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

                auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+                if (!cpu_dev) {
+                    throw std::runtime_error(format("%s: no CPU backend found", __func__));
+                }
                buft = ggml_backend_dev_buffer_type(cpu_dev);

                break;
--- a/examples/talk-llama/llama-arch.cpp
+++ b/examples/talk-llama/llama-arch.cpp
@ -19,6 +19,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_REFACT,           "refact"           },
    { LLM_ARCH_BERT,             "bert"             },
    { LLM_ARCH_NOMIC_BERT,       "nomic-bert"       },
+    { LLM_ARCH_NOMIC_BERT_MOE,   "nomic-bert-moe"   },
+    { LLM_ARCH_NEO_BERT,         "neo-bert"         },
    { LLM_ARCH_JINA_BERT_V2,     "jina-bert-v2"     },
    { LLM_ARCH_BLOOM,            "bloom"            },
    { LLM_ARCH_STABLELM,         "stablelm"         },
@ -71,6 +73,8 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_WAVTOKENIZER_DEC, "wavtokenizer-dec" },
    { LLM_ARCH_PLM,              "plm"              },
    { LLM_ARCH_BAILINGMOE,       "bailingmoe"       },
+    { LLM_ARCH_DOTS1,            "dots1"            },
+    { LLM_ARCH_ARCEE,            "arcee"            },
    { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };

@ -106,6 +110,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_EXPERT_WEIGHTS_SCALE,              "%s.expert_weights_scale"              },
    { LLM_KV_EXPERT_WEIGHTS_NORM,               "%s.expert_weights_norm"               },
    { LLM_KV_EXPERT_GATING_FUNC,                "%s.expert_gating_func"                },
+    { LLM_KV_MOE_EVERY_N_LAYERS,                "%s.moe_every_n_layers"                },
    { LLM_KV_POOLING_TYPE,                      "%s.pooling_type"                      },
    { LLM_KV_LOGIT_SCALE,                       "%s.logit_scale"                       },
    { LLM_KV_DECODER_START_TOKEN_ID,            "%s.decoder_start_token_id"            },
@ -142,6 +147,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_ATTENTION_SCALE,                        "%s.attention.scale"                        },
    { LLM_KV_ATTENTION_KEY_LENGTH_MLA,               "%s.attention.key_length_mla"               },
    { LLM_KV_ATTENTION_VALUE_LENGTH_MLA,             "%s.attention.value_length_mla"             },
+    { LLM_KV_ATTENTION_LAYER_INDICES,                "%s.attention.layer_indices"                },

    { LLM_KV_ROPE_DIMENSION_COUNT,      "%s.rope.dimension_count"                 },
    { LLM_KV_ROPE_DIMENSION_SECTIONS,   "%s.rope.dimension_sections"              },
@ -172,6 +178,8 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_CONVNEXT_EMBEDDING_LENGTH, "%s.convnext.embedding_length" },
    { LLM_KV_CONVNEXT_BLOCK_COUNT,      "%s.convnext.block_count"      },

+    { LLM_KV_CLASSIFIER_OUTPUT_LABELS, "%s.classifier.output_labels" },
+
    { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
    { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
    { LLM_KV_TOKENIZER_LIST,                 "tokenizer.ggml.tokens"                   },
@ -190,13 +198,13 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
    { LLM_KV_TOKENIZER_MASK_ID,              "tokenizer.ggml.mask_token_id"            },
    { LLM_KV_TOKENIZER_ADD_BOS,              "tokenizer.ggml.add_bos_token"            },
    { LLM_KV_TOKENIZER_ADD_EOS,              "tokenizer.ggml.add_eos_token"            },
+    { LLM_KV_TOKENIZER_ADD_SEP,              "tokenizer.ggml.add_sep_token"            },
    { LLM_KV_TOKENIZER_ADD_PREFIX,           "tokenizer.ggml.add_space_prefix"         },
    { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,      "tokenizer.ggml.remove_extra_whitespaces" },
    { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap"     },
    { LLM_KV_TOKENIZER_HF_JSON,              "tokenizer.huggingface.json"              },
    { LLM_KV_TOKENIZER_RWKV,                 "tokenizer.rwkv.world"                    },
    { LLM_KV_TOKENIZER_CHAT_TEMPLATE,        "tokenizer.chat_template"                 },
-    { LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,      "tokenizer.chat_template.%s"              },
    { LLM_KV_TOKENIZER_FIM_PRE_ID,           "tokenizer.ggml.fim_pre_token_id"         },
    { LLM_KV_TOKENIZER_FIM_SUF_ID,           "tokenizer.ggml.fim_suf_token_id"         },
    { LLM_KV_TOKENIZER_FIM_MID_ID,           "tokenizer.ggml.fim_mid_token_id"         },
@ -240,6 +248,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
        },
    },
+    {
+        LLM_ARCH_ARCEE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ROPE_FREQS,      "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,   "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
    {
        LLM_ARCH_LLAMA4,
        {
@ -446,6 +472,7 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
            { LLM_TENSOR_POS_EMBD,        "position_embd" },
            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
            { LLM_TENSOR_ATTN_Q,          "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K,          "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V,          "blk.%d.attn_v" },
@ -472,6 +499,39 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
        },
    },
+    {
+        LLM_ARCH_NOMIC_BERT_MOE,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
+            { LLM_TENSOR_TOKEN_TYPES,     "token_types" },
+            { LLM_TENSOR_ATTN_OUT_NORM,   "blk.%d.attn_output_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_LAYER_OUT_NORM,  "blk.%d.layer_output_norm" },
+            { LLM_TENSOR_FFN_GATE,        "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_GATE_INP,    "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+        },
+    },
+    {
+        LLM_ARCH_NEO_BERT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+            { LLM_TENSOR_ENC_OUTPUT_NORM, "enc.output_norm" },
+            { LLM_TENSOR_CLS,             "cls" },
+            { LLM_TENSOR_CLS_OUT,         "cls.output" },
+        },
+    },
    {
        LLM_ARCH_JINA_BERT_V2,
        {
@ -1461,6 +1521,9 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_GATE_EXPS,   "blk.%d.ffn_gate_exps" },
            { LLM_TENSOR_FFN_DOWN_EXPS,   "blk.%d.ffn_down_exps" },
            { LLM_TENSOR_FFN_UP_EXPS,     "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,  "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,  "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,    "blk.%d.ffn_up_shexp" },
        },
    },
    {
@ -1530,6 +1593,34 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
        },
    },
+    {
+        LLM_ARCH_DOTS1,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,         "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,        "output_norm" },
+            { LLM_TENSOR_OUTPUT,             "output" },
+            { LLM_TENSOR_ATTN_NORM,          "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,             "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_Q_NORM,        "blk.%d.attn_q_norm" },
+            { LLM_TENSOR_ATTN_K,             "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_K_NORM,        "blk.%d.attn_k_norm" },
+            { LLM_TENSOR_ATTN_V,             "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,           "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,           "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,           "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_UP,             "blk.%d.ffn_up" },
+            { LLM_TENSOR_FFN_DOWN,           "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_GATE_INP,       "blk.%d.ffn_gate_inp" },
+            { LLM_TENSOR_FFN_GATE_EXPS,      "blk.%d.ffn_gate_exps" },
+            { LLM_TENSOR_FFN_DOWN_EXPS,      "blk.%d.ffn_down_exps" },
+            { LLM_TENSOR_FFN_UP_EXPS,        "blk.%d.ffn_up_exps" },
+            { LLM_TENSOR_FFN_GATE_INP_SHEXP, "blk.%d.ffn_gate_inp_shexp" },
+            { LLM_TENSOR_FFN_GATE_SHEXP,     "blk.%d.ffn_gate_shexp" },
+            { LLM_TENSOR_FFN_DOWN_SHEXP,     "blk.%d.ffn_down_shexp" },
+            { LLM_TENSOR_FFN_UP_SHEXP,       "blk.%d.ffn_up_shexp" },
+            { LLM_TENSOR_FFN_EXP_PROBS_B,    "blk.%d.exp_probs_b" },
+        }
+    },
    {
        LLM_ARCH_UNKNOWN,
        {
@ -1681,8 +1772,14 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
 LLM_KV::LLM_KV(llm_arch arch, const char * suffix) : arch(arch), suffix(suffix) {}

 std::string LLM_KV::operator()(llm_kv kv) const {
-    return suffix ? ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch), suffix)
-        : ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+    std::string name = ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch));
+
+    if (suffix != nullptr) {
+        name += ".";
+        name += suffix;
+    }
+
+    return name;
 }

 std::string LLM_TN_IMPL::str() const {
@ -1721,3 +1818,25 @@ llm_arch llm_arch_from_string(const std::string & name) {
 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor) {
    return LLM_TENSOR_INFOS.at(tensor);
 }
+
+bool llm_arch_is_recurrent(const llm_arch & arch) {
+    switch (arch) {
+        case LLM_ARCH_MAMBA:
+        case LLM_ARCH_RWKV6:
+        case LLM_ARCH_RWKV6QWEN2:
+        case LLM_ARCH_RWKV7:
+        case LLM_ARCH_ARWKV7:
+            return true;
+        default:
+            return false;
+    }
+}
+
+bool llm_arch_is_hybrid(const llm_arch & arch) {
+    // TODO: There are currently no hybrid models! Once there are, this will be
+    //  the place to identify them
+    switch (arch) {
+        default:
+            return false;
+    }
+}
--- a/examples/talk-llama/llama-arch.h
+++ b/examples/talk-llama/llama-arch.h
@ -23,6 +23,8 @@ enum llm_arch {
    LLM_ARCH_REFACT,
    LLM_ARCH_BERT,
    LLM_ARCH_NOMIC_BERT,
+    LLM_ARCH_NOMIC_BERT_MOE,
+    LLM_ARCH_NEO_BERT,
    LLM_ARCH_JINA_BERT_V2,
    LLM_ARCH_BLOOM,
    LLM_ARCH_STABLELM,
@ -75,6 +77,8 @@ enum llm_arch {
    LLM_ARCH_WAVTOKENIZER_DEC,
    LLM_ARCH_PLM,
    LLM_ARCH_BAILINGMOE,
+    LLM_ARCH_DOTS1,
+    LLM_ARCH_ARCEE,
    LLM_ARCH_UNKNOWN,
 };

@ -110,6 +114,7 @@ enum llm_kv {
    LLM_KV_EXPERT_WEIGHTS_SCALE,
    LLM_KV_EXPERT_WEIGHTS_NORM,
    LLM_KV_EXPERT_GATING_FUNC,
+    LLM_KV_MOE_EVERY_N_LAYERS,
    LLM_KV_POOLING_TYPE,
    LLM_KV_LOGIT_SCALE,
    LLM_KV_DECODER_START_TOKEN_ID,
@ -146,6 +151,7 @@ enum llm_kv {
    LLM_KV_ATTENTION_SCALE,
    LLM_KV_ATTENTION_KEY_LENGTH_MLA,
    LLM_KV_ATTENTION_VALUE_LENGTH_MLA,
+    LLM_KV_ATTENTION_LAYER_INDICES,

    LLM_KV_ROPE_DIMENSION_COUNT,
    LLM_KV_ROPE_DIMENSION_SECTIONS,
@ -188,13 +194,13 @@ enum llm_kv {
    LLM_KV_TOKENIZER_MASK_ID,
    LLM_KV_TOKENIZER_ADD_BOS,
    LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_SEP,
    LLM_KV_TOKENIZER_ADD_PREFIX,
    LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
    LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
    LLM_KV_TOKENIZER_HF_JSON,
    LLM_KV_TOKENIZER_RWKV,
    LLM_KV_TOKENIZER_CHAT_TEMPLATE,
-    LLM_KV_TOKENIZER_CHAT_TEMPLATE_N,
    LLM_KV_TOKENIZER_FIM_PRE_ID,
    LLM_KV_TOKENIZER_FIM_SUF_ID,
    LLM_KV_TOKENIZER_FIM_MID_ID,
@ -211,6 +217,8 @@ enum llm_kv {
    LLM_KV_CONVNEXT_EMBEDDING_LENGTH,
    LLM_KV_CONVNEXT_BLOCK_COUNT,

+    LLM_KV_CLASSIFIER_OUTPUT_LABELS,
+
    // deprecated:
    LLM_KV_TOKENIZER_PREFIX_ID,
    LLM_KV_TOKENIZER_SUFFIX_ID,
@ -433,3 +441,6 @@ const char * llm_arch_name(llm_arch arch);
 llm_arch llm_arch_from_string(const std::string & name);

 const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);
+
+bool llm_arch_is_recurrent(const llm_arch & arch);
+bool llm_arch_is_hybrid   (const llm_arch & arch);
--- a/examples/talk-llama/llama-batch.cpp
+++ b/examples/talk-llama/llama-batch.cpp
--- a/examples/talk-llama/llama-batch.h
+++ b/examples/talk-llama/llama-batch.h
@ -2,87 +2,146 @@

 #include "llama.h"

+#include "llama-cparams.h"
+
 #include <array>
 #include <vector>
+#include <set>
+#include <bitset>
+#include <unordered_map>

-// very similar to llama_batch,
-// but has more metadata about sequences
+// keep this struct lightweight
+// it points to data in `llama_batch_allocr`
 struct llama_ubatch {
    bool equal_seqs;
    // TODO: whole_seqs for embeddings?

-    uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
-    uint32_t n_seq_tokens; // tokens per sequence
-    uint32_t n_seqs;
+    uint32_t n_tokens;     // total tokens (n_seq_tokens * n_seqs)
+    uint32_t n_seq_tokens; // tokens per sequence set
+    uint32_t n_seqs;       // sequence sets in the ubatch
+    uint32_t n_seqs_unq;   // unique sequence ids in the ubatch

-    llama_token  *  token;    // [n_tokens]
-    float        *  embd;     // [n_embd, n_tokens]
-    llama_pos    *  pos;      // [n_tokens]
-    int32_t      *  n_seq_id; // [n_seqs]
-    llama_seq_id ** seq_id;   // [n_seqs]
-    int8_t       *  output;   // [n_tokens]
+    // seq_id_unq: unique sequence ids in the ubatch
+    // seq_idx:    indices of the unique sequence ids in the ubatch in [0, n_seqs_unq)
+    //             used for extracting sequence pooled embeddings
+
+    //                          // size               | idx | val
+    llama_token  *  token;      // [n_tokens]         | i   | id, token
+    float        *  embd;       // [n_embd, n_tokens] | i   | embd
+    llama_pos    *  pos;        // [n_tokens]         | i   | pos
+    int32_t      *  n_seq_id;   // [n_tokens]         | i   | -
+    llama_seq_id ** seq_id;     // [n_tokens]         | s   | s0, s1, seq_id
+    llama_seq_id *  seq_id_unq; // [n_seqs_unq]       | s   | seq_id
+    int32_t      *  seq_idx;    // [LLAMA_MAX_SEQ]    | -   | seq_idx
+    int8_t       *  output;     // [n_tokens]         | i   | -
 };

-struct llama_sbatch_seq {
-    int32_t n_seq_id;
+// a helper for sanitizing, fulfilling and splitting a batch
+class llama_batch_allocr {
+public:
+    llama_batch_allocr(uint32_t n_pos_per_embd);

-    llama_seq_id * seq_id;
+    // sanitize and auto-gen missing data in the input batch
+    // memory is optional. if provided will be used to check for sequence continuity and to determine the positions
+    bool init(
+            const llama_batch & batch_inp,
+            const llama_vocab & vocab,
+            const llama_memory_i * memory,
+            uint32_t n_embd,
+            bool output_all);

-    size_t offset;
-    size_t length;
-};
+    const llama_batch & get_batch() const;

-// sequence-length-aware batch splitting
-struct llama_sbatch {
-    // tokens left in this batch
-    size_t n_tokens;
+    uint32_t get_n_tokens()  const;
+    uint32_t get_n_outputs() const;

-    size_t n_embd;
+    // the array of output indices in the order they were encountered during the ubatch splitting
+    std::vector<int32_t> & get_out_ids();

-    bool logits_all; // TODO: remove once lctx.logits_all is removed too
+    // min/max positions of each sequence in the current ubatch
+    llama_pos seq_pos_min(llama_seq_id seq_id) const;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const;

-    // sorted indices into the batch
-    std::vector<int64_t> ids;
-    // batch indices of the output
-    std::vector<int64_t> out_ids;
-    std::vector<llama_sbatch_seq> seq;
+    // call once before splitting the batch to reset the internal state
+    void split_reset();

-    const llama_batch * batch = nullptr;
+    // simple split, unknown number of sequence sets of unequal lengths
+    llama_ubatch split_simple(uint32_t n_ubatch);

-    // buffers for the ubatch
-    std::vector<llama_token>    ubatch_token;
-    std::vector<float>          ubatch_embd;
-    std::vector<llama_pos>      ubatch_pos;
-    std::vector<int32_t>        ubatch_n_seq_id;
-    std::vector<llama_seq_id *> ubatch_seq_id;
-    std::vector<int8_t>         ubatch_output;
+    // make ubatches of equal-length sequences sets
+    llama_ubatch split_equal(uint32_t n_ubatch);

-    llama_ubatch reserve_ubatch(size_t n_ubatch, bool has_embd = false);
+    // sequence-set-wise split - each ubatch contains a single sequence-set
+    llama_ubatch split_seq(uint32_t n_ubatch);

-    void add_seq_to_ubatch(llama_ubatch & ubatch, llama_sbatch_seq & seq, size_t length);
+    // a helper method for creating a well-defined ubatch of tokens
+    // TODO: support embeddings if needed in the future
+    llama_ubatch ubatch_reserve(uint32_t n_seq_tokens, uint32_t n_seqs);

-    // simple split, unknown number of sequences of unequal lengths
-    llama_ubatch split_simple(size_t n_ubatch);
+private:
+    void clear();

-    // make batches of equal-length sequences
-    llama_ubatch split_equal(size_t n_ubatch);
+    // create the next ubatch based on the provided batch indices (idxs) and the number of sequence sets (n_seqs)
+    // return llama_ubatch.n_tokens == 0 if the entire batch was consumed
+    llama_ubatch ubatch_add(const std::vector<int32_t> & idxs, uint32_t n_seqs, bool equal_seqs);

-    // sequence-wise split
-    llama_ubatch split_seq(size_t n_ubatch);
+    // for debugging, start with LLAMA_BATCH_DEBUG=2
+    void ubatch_print(const llama_ubatch & ubatch, int debug);

-    void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
-};
+    llama_batch batch;

-// temporary allocate memory for the input batch if needed
-struct llama_batch_allocr {
-    struct llama_batch batch;
+    // only for debugging purposes
+    const llama_vocab * vocab;
+
+    // TODO: this is more of a temporary solution until we have a better way to handle multiple positions per token/embd
+    //       ref: https://github.com/ggml-org/llama.cpp/issues/13694#issuecomment-2983871762
+    const uint32_t n_pos_per_embd;
+
+    uint32_t n_embd;
+    uint32_t n_outputs;

    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
+
    std::vector<llama_pos>      pos;
    std::vector<int32_t>        n_seq_id;
    std::vector<llama_seq_id *> seq_id;
-    std::vector<int8_t>         logits;
+    std::vector<llama_seq_id>   seq_id_unq;
+    std::vector<int32_t>        seq_idx;
+    std::vector<int8_t>         output;

-    // optionally fulfill the batch returned by llama_batch_get_one
-    llama_batch_allocr(struct llama_batch in_batch, llama_pos p0);
+    using pos_set_t = std::set<llama_pos>;
+    using seq_cpl_t = std::vector<bool>;
+
+    std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
+    std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
+
+    using idx_vec_t = std::vector<int32_t>;
+    using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
+
+    std::vector<seq_set_t> seq_set; // seq_set[i]: the sequence set of token i
+
+    std::unordered_map<seq_set_t, idx_vec_t> seq_set_map; // the indices at which the sequence set appears
+
+    // batch indices of the output
+    std::vector<int32_t> out_ids;
+
+    // used[i] indicates if token i has already been used in a previous ubatch
+    std::vector<bool> used;
+
+    // llama_ubatch points to this data:
+    struct ubatch {
+        std::vector<llama_token>    token;
+        std::vector<float>          embd;
+        std::vector<llama_pos>      pos;
+        std::vector<int32_t>        n_seq_id;
+        std::vector<llama_seq_id *> seq_id;
+        std::vector<llama_seq_id>   seq_id_unq;
+        std::vector<int32_t>        seq_idx;
+        std::vector<int8_t>         output;
+    };
+
+    // current splitting state:
+    std::vector<ubatch> ubatches;
+
+    int debug;
 };
--- a/examples/talk-llama/llama-chat.cpp
+++ b/examples/talk-llama/llama-chat.cpp
@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "mistral-v3",        LLM_CHAT_TEMPLATE_MISTRAL_V3        },
    { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
    { "mistral-v7",        LLM_CHAT_TEMPLATE_MISTRAL_V7        },
+    { "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
    { "phi3",              LLM_CHAT_TEMPLATE_PHI_3             },
    { "phi4",              LLM_CHAT_TEMPLATE_PHI_4             },
    { "falcon3",           LLM_CHAT_TEMPLATE_FALCON_3          },
@ -50,8 +51,8 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "deepseek3",         LLM_CHAT_TEMPLATE_DEEPSEEK_3        },
    { "command-r",         LLM_CHAT_TEMPLATE_COMMAND_R         },
    { "llama3",            LLM_CHAT_TEMPLATE_LLAMA_3           },
-    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGML_3         },
-    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGML_4         },
+    { "chatglm3",          LLM_CHAT_TEMPLATE_CHATGLM_3         },
+    { "chatglm4",          LLM_CHAT_TEMPLATE_CHATGLM_4         },
    { "glmedge",           LLM_CHAT_TEMPLATE_GLMEDGE           },
    { "minicpm",           LLM_CHAT_TEMPLATE_MINICPM           },
    { "exaone3",           LLM_CHAT_TEMPLATE_EXAONE_3          },
@ -122,6 +123,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        }
    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|end|>")) {
        return LLM_CHAT_TEMPLATE_PHI_3;
+    } else if (tmpl_contains("[gMASK]<sop>")) {
+        return LLM_CHAT_TEMPLATE_CHATGLM_4;
    } else if (tmpl_contains("<|assistant|>") && tmpl_contains("<|user|>")) {
        return tmpl_contains("</s>") ? LLM_CHAT_TEMPLATE_FALCON_3 : LLM_CHAT_TEMPLATE_GLMEDGE;
    } else if (tmpl_contains("<|{{ item['role'] }}|>") && tmpl_contains("<|begin_of_image|>")) {
@ -154,9 +157,7 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_LLAMA_3;
    } else if (tmpl_contains("[gMASK]sop")) {
        // chatglm3-6b
-        return LLM_CHAT_TEMPLATE_CHATGML_3;
-    } else if (tmpl_contains("[gMASK]<sop>")) {
-        return LLM_CHAT_TEMPLATE_CHATGML_4;
+        return LLM_CHAT_TEMPLATE_CHATGLM_3;
    } else if (tmpl_contains(LU8("<用户>"))) {
        // MiniCPM-3B-OpenHermes-2.5-v2-GGUF
        return LLM_CHAT_TEMPLATE_MINICPM;
@ -182,6 +183,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_BAILING;
    } else if (tmpl_contains("<|header_start|>") && tmpl_contains("<|header_end|>")) {
        return LLM_CHAT_TEMPLATE_LLAMA4;
+    } else if (tmpl_contains("<|endofuserprompt|>")) {
+        return LLM_CHAT_TEMPLATE_DOTS1;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
 }
@ -202,19 +205,20 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|im_start|>assistant\n";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
        // Official mistral 'v7' template
        // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+        //      https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+        const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
        for (auto message : chat) {
            std::string role(message->role);
            std::string content(message->content);
            if (role == "system") {
-                ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+                ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
            } else if (role == "user") {
-                ss << "[INST] " << content << "[/INST]";
-            }
-            else {
-                ss << " " << content << "</s>";
+                ss << "[INST]" << trailing_space << content << "[/INST]";
+            } else {
+                ss << trailing_space << content << "</s>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
@ -329,7 +333,7 @@ int32_t llm_chat_apply_template(
            std::string role(message->role);
            if (role == "system") {
                // there is no system message for gemma, but we will merge it with user prompt, so nothing is broken
-                system_prompt = trim(message->content);
+                system_prompt += trim(message->content);
                continue;
            }
            // in gemma, "assistant" is "model"
@ -351,7 +355,7 @@ int32_t llm_chat_apply_template(
            std::string role(message->role);
            if (role == "system") {
                // there is no system message support, we will merge it with user prompt
-                system_prompt = message->content;
+                system_prompt += message->content;
                continue;
            } else if (role == "user") {
                ss << "Human: ";
@ -437,7 +441,7 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|start_header_id|>assistant<|end_header_id|>\n\n";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_3) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_3) {
        // chatglm3-6b
        ss << "[gMASK]" << "sop";
        for (auto message : chat) {
@ -447,14 +451,14 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "<|assistant|>";
        }
-    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGML_4) {
+    } else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
        ss << "[gMASK]" << "<sop>";
        for (auto message : chat) {
            std::string role(message->role);
            ss << "<|" << role << "|>" << "\n" << message->content;
        }
        if (add_ass) {
-            ss << "<|assistant|>";
+            ss << "<|assistant|>\n";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
        for (auto message : chat) {
@ -641,6 +645,21 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "Assistant:";
        }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_DOTS1) {
+        // dots.llm1.inst (DOTS1)
+        for (auto message : chat) {
+            std::string role(message->role);
+            if (role == "system") {
+                ss << "<|system|>" << message->content << "<|endofsystem|>";
+            } else if (role == "user") {
+                ss << "<|userprompt|>" << message->content << "<|endofuserprompt|>";
+            } else {
+                ss << "<|response|>" << message->content << "<|endofresponse|>";
+            }
+        }
+        if (add_ass) {
+            ss << "<|response|>";
+        }
    } else {
        // template not supported
        return -1;
--- a/examples/talk-llama/llama-chat.h
+++ b/examples/talk-llama/llama-chat.h
@ -14,6 +14,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_MISTRAL_V3,
    LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
    LLM_CHAT_TEMPLATE_MISTRAL_V7,
+    LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
    LLM_CHAT_TEMPLATE_PHI_3,
    LLM_CHAT_TEMPLATE_PHI_4,
    LLM_CHAT_TEMPLATE_FALCON_3,
@ -29,8 +30,8 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_DEEPSEEK_3,
    LLM_CHAT_TEMPLATE_COMMAND_R,
    LLM_CHAT_TEMPLATE_LLAMA_3,
-    LLM_CHAT_TEMPLATE_CHATGML_3,
-    LLM_CHAT_TEMPLATE_CHATGML_4,
+    LLM_CHAT_TEMPLATE_CHATGLM_3,
+    LLM_CHAT_TEMPLATE_CHATGLM_4,
    LLM_CHAT_TEMPLATE_GLMEDGE,
    LLM_CHAT_TEMPLATE_MINICPM,
    LLM_CHAT_TEMPLATE_EXAONE_3,
@ -42,6 +43,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_BAILING,
    LLM_CHAT_TEMPLATE_LLAMA4,
    LLM_CHAT_TEMPLATE_SMOLVLM,
+    LLM_CHAT_TEMPLATE_DOTS1,
    LLM_CHAT_TEMPLATE_UNKNOWN,
 };

--- a/examples/talk-llama/llama-context.cpp
+++ b/examples/talk-llama/llama-context.cpp
--- a/examples/talk-llama/llama-context.h
+++ b/examples/talk-llama/llama-context.h
@ -1,22 +1,25 @@
 #pragma once

 #include "llama.h"
-#include "llama-batch.h"
 #include "llama-cparams.h"
 #include "llama-graph.h"
 #include "llama-adapter.h"

 #include "ggml-cpp.h"
+#include "ggml-opt.h"

 #include <map>
 #include <vector>

 struct llama_model;
-struct llama_kv_cache;
+class llama_batch_allocr;

 class llama_io_read_i;
 class llama_io_write_i;

+struct llama_memory_i;
+struct llama_memory_state_i;
+
 struct llama_context {
    // init scheduler and compute buffers, reserve worst-case graphs
    llama_context(
@ -27,7 +30,12 @@ struct llama_context {

    void synchronize();

-    const llama_model & get_model() const;
+    const llama_model   & get_model()   const;
+    const llama_cparams & get_cparams() const;
+
+    ggml_backend_sched_t get_sched() const;
+
+    ggml_context * get_ctx_compute() const;

    uint32_t n_ctx()         const;
    uint32_t n_ctx_per_seq() const;
@ -38,10 +46,12 @@ struct llama_context {
    uint32_t n_threads()       const;
    uint32_t n_threads_batch() const;

-          llama_kv_cache * get_kv_self();
-    const llama_kv_cache * get_kv_self() const;
+    llama_memory_t get_memory() const;

-    void kv_self_update();
+    // return true of the KV cache was updated
+    // TODO: remove
+    bool kv_self_update(bool optimize);
+    void kv_self_defrag_sched();

    enum llama_pooling_type pooling_type() const;

@ -82,8 +92,18 @@ struct llama_context {
                int32_t   il_start,
                int32_t   il_end);

-    int encode(llama_batch & inp_batch);
-    int decode(llama_batch & inp_batch);
+    // process a single ubatch with a specific graph type
+    // if memory_state is provided, it will be applied first to the context's memory
+    // ret contains the status of the graph computation
+    // returns nullptr only if ret != GGML_STATUS_SUCCESS
+    llm_graph_result_ptr process_ubatch(
+              const llama_ubatch & ubatch,
+                  llm_graph_type   gtype,
+            llama_memory_state_i * mstate,
+                     ggml_status & ret);
+
+    int encode(const llama_batch & batch_inp);
+    int decode(const llama_batch & batch_inp);

    //
    // state save/load
@ -128,6 +148,32 @@ struct llama_context {
    llama_perf_context_data perf_get_data() const;
    void perf_reset();

+    //
+    // training
+    //
+
+    void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+
+    void opt_epoch(
+            ggml_opt_dataset_t      dataset,
+            ggml_opt_result_t       result_train,
+            ggml_opt_result_t       result_eval,
+            int64_t                 idata_split,
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
+    void opt_epoch_iter(
+            ggml_opt_dataset_t               dataset,
+            ggml_opt_result_t                result,
+            const std::vector<llama_token> & tokens,
+            const std::vector<llama_token> & labels_sparse,
+            llama_batch                    & batch,
+            ggml_opt_epoch_callback          callback,
+            bool                             train,
+            int64_t                          idata_in_loop,
+            int64_t                          ndata_in_loop,
+            int64_t                          t_loop_start);
+
 private:
    //
    // output
@ -135,52 +181,34 @@ private:

    // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
-    int32_t output_reserve(int32_t n_outputs);
-
-    // make the outputs have the same order they had in the user-provided batch
-    // TODO: maybe remove this
-    void output_reorder();
+    uint32_t output_reserve(int32_t n_outputs);

    //
    // graph
    //

+public:
    int32_t graph_max_nodes() const;

    // zero-out inputs and create the ctx_compute for the compute graph
    ggml_cgraph * graph_init();

-    llm_graph_result_ptr graph_build(
-            ggml_context * ctx,
-             ggml_cgraph * gf,
-      const llama_ubatch & ubatch,
-          llm_graph_type   gtype);
-
    // returns the result of ggml_backend_sched_graph_compute_async execution
-    ggml_status graph_compute(
-            ggml_cgraph * gf,
-                   bool   batched);
+    ggml_status graph_compute(ggml_cgraph * gf, bool batched);
+
+    // reserve a graph with a dummy ubatch of the specified size
+    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate);
+
+private:
+    llm_graph_result_ptr graph_build(
+                    ggml_context * ctx,
+                     ggml_cgraph * gf,
+              const llama_ubatch & ubatch,
+                  llm_graph_type   gtype,
+      const llama_memory_state_i * mstate);

    llm_graph_cb graph_get_cb() const;

-    // used by kv_self_update()
-    ggml_tensor * build_rope_shift(
-        ggml_context * ctx0,
-        ggml_tensor * cur,
-        ggml_tensor * shift,
-        ggml_tensor * factors,
-              float   freq_base,
-              float   freq_scale,
-        ggml_backend_buffer * bbuf) const;
-
-    llm_graph_result_ptr build_kv_self_shift(
-            ggml_context * ctx0,
-            ggml_cgraph * gf) const;
-
-    llm_graph_result_ptr build_kv_self_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph * gf) const;
-
    // TODO: read/write lora adapters and cvec
    size_t state_write_data(llama_io_write_i & io);
    size_t state_read_data (llama_io_read_i  & io);
@ -197,14 +225,13 @@ private:
    llama_cparams       cparams;
    llama_adapter_cvec  cvec;
    llama_adapter_loras loras;
-    llama_sbatch        sbatch;

    llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably

-    std::unique_ptr<llama_kv_cache_unified> kv_self;
+    std::unique_ptr<llama_memory_i> memory;

-    // TODO: remove
-    bool logits_all = false;
+    // TODO: temporary, until the llama_kv_self_defrag() API is removed
+    bool memory_force_optimize = false;

    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t  logits_size = 0; // capacity (of floats) for logits
@ -219,8 +246,10 @@ private:
    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
    std::map<llama_seq_id, std::vector<float>> embd_seq;

-    int32_t n_outputs     = 0; // number of actually-used outputs in the current ubatch or last logical batch
-    int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers
+    // reuse the batch_allocr to avoid unnecessary memory allocations
+    std::unique_ptr<llama_batch_allocr> balloc;
+
+    uint32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch

    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

@ -231,6 +260,9 @@ private:

    ggml_context_ptr ctx_compute;

+    // training
+    ggml_opt_context_t opt_ctx = nullptr;
+
    ggml_threadpool_t threadpool       = nullptr;
    ggml_threadpool_t threadpool_batch = nullptr;

--- a/examples/talk-llama/llama-cparams.cpp
+++ b/examples/talk-llama/llama-cparams.cpp
@ -1 +1,5 @@
 #include "llama-cparams.h"
+
+size_t llama_max_parallel_sequences(void) {
+    return LLAMA_MAX_SEQ;
+}
--- a/examples/talk-llama/llama-cparams.h
+++ b/examples/talk-llama/llama-cparams.h
@ -4,6 +4,8 @@

 #include <cstdint>

+#define LLAMA_MAX_SEQ 64
+
 struct llama_cparams {
    uint32_t n_ctx;           // context size used during inference
    uint32_t n_batch;
@ -30,6 +32,7 @@ struct llama_cparams {
    bool flash_attn;
    bool no_perf;
    bool warmup;
+    bool op_offload;

    enum llama_pooling_type pooling_type;

--- a/examples/talk-llama/llama-grammar.cpp
+++ b/examples/talk-llama/llama-grammar.cpp
@ -1177,8 +1177,18 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
            for (const auto & trigger_pattern : grammar.trigger_patterns) {
                if (std::regex_match(grammar.trigger_buffer, match, trigger_pattern.regex)) {
                    grammar.awaiting_trigger = false;
-                    // get from the first match to the end of the string
-                    auto constrained_str = grammar.trigger_buffer.substr(match.position(1));
+                    // get from the first matched capturing group to the end of the string
+                    size_t start = std::string::npos;
+                    for (auto i = 1u; i < match.size(); i++) {
+                        if (match.length(i) > 0) {
+                            start = match.position(i);
+                            break;
+                        }
+                    }
+                    if (start == std::string::npos) {
+                        start = match.position(0);
+                    }
+                    auto constrained_str = grammar.trigger_buffer.substr(start);
                    // std::string constrained_str(match[1].first, grammar.trigger_buffer.end());
                    grammar.trigger_buffer.clear();
                    llama_grammar_accept_str(grammar, constrained_str);
--- a/examples/talk-llama/llama-graph.cpp
+++ b/examples/talk-llama/llama-graph.cpp
--- a/examples/talk-llama/llama-graph.h
+++ b/examples/talk-llama/llama-graph.h
@ -17,8 +17,12 @@ struct ggml_tensor;
 struct llama_ubatch;
 struct llama_cparams;

-class llama_memory_i;
-class llama_kv_cache_unified;
+struct llama_memory_state_i;
+
+class llama_kv_cache_unified_state;
+class llama_kv_cache_unified_iswa_state;
+class llama_memory_recurrent_state;
+class llama_memory_hybrid_state;

 // certain models (typically multi-modal) can produce different types of graphs
 enum llm_graph_type {
@ -33,6 +37,7 @@ enum llm_ffn_op_type {
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
    LLM_FFN_SWIGLU,
+    LLM_FFN_GEGLU,
 };

 enum llm_ffn_gate_type {
@ -90,29 +95,27 @@ public:

 class llm_graph_input_pos : public llm_graph_input_i {
 public:
-    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
+    llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
    virtual ~llm_graph_input_pos() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * pos = nullptr; // I32 [n_batch]

-    const int64_t n_pos_per_token = 1;
+    const uint32_t n_pos_per_embd = 1;
 };

 // temperature tuning, used by llama4
 class llm_graph_input_attn_temp : public llm_graph_input_i {
 public:
-    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
-        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
+        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
    virtual ~llm_graph_input_attn_temp() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

-    const int64_t n_pos_per_token = 1;
-
    const uint32_t n_attn_temp_floor_scale;
    const float    f_attn_temp_scale;
 };
@ -133,7 +136,7 @@ class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 public:
    llm_graph_input_pos_bucket_kv(
            const llama_hparams & hparams,
-            const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {}
+            const llama_kv_cache_unified_state * kv_state) : hparams(hparams), kv_state(kv_state) {}
    virtual ~llm_graph_input_pos_bucket_kv() = default;

    void set_input(const llama_ubatch * ubatch) override;
@ -141,7 +144,7 @@ public:
    ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]

    const llama_hparams & hparams;
-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_unified_state * kv_state;
 };

 class llm_graph_input_out_ids : public llm_graph_input_i {
@ -186,28 +189,16 @@ public:
    const llama_cparams & cparams;
 };

-class llm_graph_input_s_copy : public llm_graph_input_i {
+class llm_graph_input_rs : public llm_graph_input_i {
 public:
-    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
-    virtual ~llm_graph_input_s_copy() = default;
+    llm_graph_input_rs(const llama_memory_recurrent_state * mem_state) : mem_state(mem_state) {}
+    virtual ~llm_graph_input_rs() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_copy; // I32 [kv_size]

-    const llama_kv_cache_unified * kv_self;
-};
-
-class llm_graph_input_s_mask : public llm_graph_input_i {
-public:
-    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
-    virtual ~llm_graph_input_s_mask() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * s_mask; // F32 [1, n_kv]
-
-    const llama_kv_cache_unified * kv_self;
+    const llama_memory_recurrent_state * mem_state;
 };

 class llm_graph_input_cross_embd : public llm_graph_input_i {
@ -247,15 +238,40 @@ public:
    llm_graph_input_attn_kv_unified(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
-            const llama_kv_cache_unified * kv_self) :
+            const llama_kv_cache_unified_state * kv_state) :
        hparams(hparams),
        cparams(cparams),
-        kv_self(kv_self) {
+        kv_state(kv_state) {
    }
    ~llm_graph_input_attn_kv_unified() = default;

    void set_input(const llama_ubatch * ubatch) override;

+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const llama_kv_cache_unified_state * kv_state;
+};
+
+class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_kv_unified_iswa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_unified_iswa_state * kv_state) :
+        hparams(hparams),
+        cparams(cparams),
+        kv_state(kv_state) {
+    }
+    ~llm_graph_input_attn_kv_unified_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

@ -267,7 +283,7 @@ public:
    const llama_hparams & hparams;
    const llama_cparams & cparams;

-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_unified_iswa_state * kv_state;
 };

 class llm_graph_input_attn_cross : public llm_graph_input_i {
@ -285,6 +301,33 @@ public:
    const llama_cross * cross = nullptr;
 };

+class llm_graph_input_mem_hybrid : public llm_graph_input_i {
+public:
+    llm_graph_input_mem_hybrid(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_memory_hybrid_state * mem_state) :
+        hparams(hparams),
+        cparams(cparams),
+        mem_state(mem_state) {
+    }
+    virtual ~llm_graph_input_mem_hybrid() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * s_copy; // I32 [kv_size]
+
+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const llama_memory_hybrid_state * mem_state;
+};
+
 //
 // llm_graph_result
 //
@ -299,6 +342,7 @@ class llm_graph_result_i {
 public:
    virtual ~llm_graph_result_i() = default;

+    virtual ggml_tensor * get_tokens()      = 0;
    virtual ggml_tensor * get_logits()      = 0;
    virtual ggml_tensor * get_embd()        = 0;
    virtual ggml_tensor * get_embd_pooled() = 0;
@ -313,6 +357,7 @@ class llm_graph_result : public llm_graph_result_i {
 public:
    virtual ~llm_graph_result() = default;

+    ggml_tensor * get_tokens()      override { return t_tokens; }
    ggml_tensor * get_logits()      override { return t_logits; }
    ggml_tensor * get_embd()        override { return t_embd; }
    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
@ -329,6 +374,7 @@ public:
    }

    // important graph nodes
+    ggml_tensor * t_tokens      = nullptr;
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
@ -352,15 +398,15 @@ struct llm_graph_params {
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;

-    ggml_backend_sched * sched;
-    ggml_backend * backend_cpu;
+    ggml_backend_sched_t sched;
+    ggml_backend_t backend_cpu;

-    const llama_adapter_cvec  * cvec;
-    const llama_adapter_loras * loras;
-    const llama_memory_i      * memory;
-    const llama_cross         * cross;
+    const llama_adapter_cvec   * cvec;
+    const llama_adapter_loras  * loras;
+    const llama_memory_state_i * mstate;
+    const llama_cross          * cross;

-    int32_t n_outputs;
+    uint32_t n_outputs;

    const llm_graph_cb & cb;
 };
@ -376,7 +422,6 @@ struct llm_graph_context {
    const int64_t n_layer;
    const int64_t n_rot;
    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
-    const int64_t n_ctx_per_seq;
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head_k;
@ -395,8 +440,8 @@ struct llm_graph_context {
    const float norm_eps;
    const float norm_rms_eps;

-    const int32_t n_tokens;
-    const int32_t n_outputs;
+    const int64_t n_tokens;
+    const int64_t n_outputs;
    const int32_t n_ctx_orig; // yarn

    const enum llama_pooling_type pooling_type;
@ -404,14 +449,14 @@ struct llm_graph_context {

    ggml_context * ctx0 = nullptr;

-    ggml_backend_sched * sched;
+    ggml_backend_sched_t sched;

-    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
+    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

-    const llama_adapter_cvec  * cvec;
-    const llama_adapter_loras * loras;
-    const llama_memory_i      * memory;
-    const llama_cross         * cross;
+    const llama_adapter_cvec   * cvec;
+    const llama_adapter_loras  * loras;
+    const llama_memory_state_i * mstate;
+    const llama_cross          * cross;

    const llm_graph_cb & cb_func;

@ -419,8 +464,6 @@ struct llm_graph_context {

    llm_graph_context(const llm_graph_params & params);

-    int64_t n_pos_per_token() const;
-
    void cb(ggml_tensor * cur, const char * name, int il) const;

    //
@ -491,27 +534,26 @@ struct llm_graph_context {
    ggml_tensor * build_inp_out_ids() const;
    ggml_tensor * build_inp_mean() const;
    ggml_tensor * build_inp_cls() const;
-    ggml_tensor * build_inp_s_copy() const;
-    ggml_tensor * build_inp_s_mask() const;

    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
    ggml_tensor * build_inp_pos_bucket_dec() const;
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;

+    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;
+
    //
    // attention
    //

    ggml_tensor * build_attn_mha(
             ggml_cgraph * gf,
-             ggml_tensor * q,     // [n_embd_head_q, n_tokens, n_head_q]
-             ggml_tensor * k,     // [n_embd_head_k, n_tokens, n_head_k]
-             ggml_tensor * v,     // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+             ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
+             ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
+             ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
-             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-                    bool   v_trans,
+             ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale) const;

    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@ -544,6 +586,21 @@ struct llm_graph_context {
                  float   kq_scale,
                    int   il) const;

+    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+                  float   kq_scale,
+                    int   il) const;
+
    llm_graph_input_attn_cross * build_attn_inp_cross() const;

    ggml_tensor * build_attn(
@ -559,23 +616,62 @@ struct llm_graph_context {
                  float   kq_scale,
                    int   il) const;

+    ggml_tensor * build_attn(
+            llm_graph_input_mem_hybrid * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+                  float   kq_scale,
+                    int   il) const;
    //
    // recurrent
    //

-    ggml_tensor * build_copy_mask_state(
-             ggml_cgraph * gf,
-             ggml_tensor * s,
-             ggml_tensor * state_copy,
-             ggml_tensor * state_mask,
-                 int32_t   n_state,
-                 int32_t   n_seqs) const;
+    // TODO: avoid notion of "kv"
+    // TODO: move this implementation to llama_memory_recurrent.
+    //       this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
+    //       when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
+    //         implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
+    //         `llama_memory_recurrent`
+    ggml_tensor * build_rs(
+            ggml_cgraph * gf,
+            ggml_tensor * s,
+            ggml_tensor * state_copy,
+                int32_t   state_size,
+                int32_t   n_seqs,
+               uint32_t   n_kv,
+               uint32_t   kv_head,
+               uint32_t   kv_size,
+                int32_t   rs_zero,
+                   bool   avoid_copies = false) const;
+
+    llm_graph_input_rs * build_rs_inp() const;
+
+    ggml_tensor * build_rs(
+            llm_graph_input_rs * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * s,
+                int32_t   state_size,
+                int32_t   n_seqs,
+                   bool   avoid_copies = false) const;
+
+    ggml_tensor * build_rs(
+            llm_graph_input_mem_hybrid * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * s,
+                int32_t   state_size,
+                int32_t   n_seqs,
+                   bool   avoid_copies = false) const;

    ggml_tensor * build_rwkv_token_shift_load(
-             ggml_cgraph * gf,
-             ggml_tensor * state_copy,
-             ggml_tensor * state_mask,
-      const llama_ubatch & ubatch,
+        llm_graph_input_rs * inp,
+               ggml_cgraph * gf,
+        const llama_ubatch & ubatch,
                     int   il) const;

    ggml_tensor * build_rwkv_token_shift_store(
@ -594,3 +690,6 @@ struct llm_graph_context {
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
 };
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
--- a/examples/talk-llama/llama-hparams.cpp
+++ b/examples/talk-llama/llama-hparams.cpp
@ -2,6 +2,22 @@

 #include "ggml.h"

+void llama_hparams::set_swa_pattern(uint32_t n_pattern) {
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        swa_layers[il] = n_pattern == 0 || (il % n_pattern < (n_pattern - 1));
+    }
+}
+
+bool llama_hparams::is_swa_any() const {
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        if (swa_layers[il]) {
+            return true;
+        }
+    }
+
+    return false;
+}
+
 uint32_t llama_hparams::n_head(uint32_t il) const {
    if (il < n_layer) {
        return n_head_arr[il];
@ -49,7 +65,7 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
    return n_embd_head_v * n_head_kv;
 }

-uint32_t llama_hparams::n_embd_k_s() const {
+uint32_t llama_hparams::n_embd_r() const {
    if (wkv_head_size != 0) {
        // for RWKV models
        return token_shift_count * n_embd;
@ -60,7 +76,7 @@ uint32_t llama_hparams::n_embd_k_s() const {
    return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
 }

-uint32_t llama_hparams::n_embd_v_s() const {
+uint32_t llama_hparams::n_embd_s() const {
    if (wkv_head_size != 0) {
        // corresponds to RWKV's wkv_states size
        return n_embd * wkv_head_size;
@ -70,9 +86,17 @@ uint32_t llama_hparams::n_embd_v_s() const {
    return ssm_d_state * ssm_d_inner;
 }

+bool llama_hparams::is_recurrent(uint32_t il) const {
+    return recurrent_layer_arr[il];
+}
+
+uint32_t llama_hparams::n_pos_per_embd() const {
+    return rope_type == LLAMA_ROPE_TYPE_MROPE ? 4 : 1;
+}
+
 bool llama_hparams::is_swa(uint32_t il) const {
    if (il < n_layer) {
-        return n_swa > 0 && n_swa_pattern > 0 && il % n_swa_pattern < (n_swa_pattern - 1);
+        return swa_layers[il];
    }

    GGML_ABORT("fatal error");
--- a/examples/talk-llama/llama-hparams.h
+++ b/examples/talk-llama/llama-hparams.h
@ -14,6 +14,12 @@ enum llama_expert_gating_func_type {
    LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
 };

+enum llama_swa_type {
+    LLAMA_SWA_TYPE_NONE     = 0,
+    LLAMA_SWA_TYPE_STANDARD = 1,
+    LLAMA_SWA_TYPE_CHUNKED  = 2,
+};
+
 struct llama_hparams_posnet {
    uint32_t n_embd;
    uint32_t n_layer;
@ -35,8 +41,6 @@ struct llama_hparams {
    uint32_t n_embd_features = 0;
    uint32_t n_layer;
    uint32_t n_rot;
-    uint32_t n_swa = 0; // sliding window attention (SWA)
-    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
    uint32_t n_expert = 0;
@ -66,6 +70,7 @@ struct llama_hparams {
    float    expert_weights_scale = 0.0;
    bool     expert_weights_norm  = false;
    uint32_t expert_gating_func   = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+    uint32_t moe_every_n_layers   = 0;

    float f_norm_eps;
    float f_norm_rms_eps;
@ -95,12 +100,24 @@ struct llama_hparams {

    std::array<int, 4> rope_sections;

+    // Sliding Window Attention (SWA)
+    llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+    // the size of the sliding window (0 - no SWA)
+    uint32_t n_swa = 0;
+    // if swa_layers[il] == true, then layer il is SWA
+    // if swa_layers[il] == false, then layer il is dense (i.e. non-SWA)
+    // by default, all layers are dense
+    std::array<bool, LLAMA_MAX_LAYERS> swa_layers;
+
    // for State Space Models
    uint32_t ssm_d_conv  = 0;
    uint32_t ssm_d_inner = 0;
    uint32_t ssm_d_state = 0;
    uint32_t ssm_dt_rank = 0;

+    // for hybrid state space models
+    std::array<bool, LLAMA_MAX_LAYERS> recurrent_layer_arr;
+
    bool ssm_dt_b_c_rms = false;

    float f_clamp_kqv      = 0.0f;
@ -115,11 +132,13 @@ struct llama_hparams {
    bool causal_attn   = true;
    bool use_alibi     = false;
    bool attn_soft_cap = false;
+    bool use_kq_norm   = true;

+    // for Classifiers
+    uint32_t n_cls_out = 1;
+
+    // llama4
    uint32_t n_moe_layer_step        = 0;
-    bool     use_kq_norm             = true;
-    uint32_t n_attn_chunk            = 0;
-    // values below seems to be fixed on llama4
    uint32_t n_no_rope_layer_step    = 4;
    uint32_t n_attn_temp_floor_scale = 8192;
    float    f_attn_temp_scale       = 0.1;
@ -132,6 +151,23 @@ struct llama_hparams {
    enum llama_rope_type         rope_type               = LLAMA_ROPE_TYPE_NONE;
    enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;

+    // this value n_pattern means that every nth layer is dense (i.e. non-SWA)
+    // note that if n_pattern == 0, all layers are SWA
+    //           if n_pattern == 1, all layers are dense
+    // example: n_pattern = 3
+    //   il == 0: swa
+    //   il == 1: swa
+    //   il == 2: dense
+    //   il == 3: swa
+    //   il == 4: swa
+    //   il == 5: dense
+    //   il == 6: swa
+    //   etc ...
+    void set_swa_pattern(uint32_t n_pattern);
+
+    // return true if one of the layers is SWA
+    bool is_swa_any() const;
+
    uint32_t n_head(uint32_t il = 0) const;

    uint32_t n_head_kv(uint32_t il = 0) const;
@ -148,10 +184,15 @@ struct llama_hparams {

    // dimension of the rolling state embeddings
    // corresponds to Mamba's conv_states size or RWKV's token_shift states size
-    uint32_t n_embd_k_s() const;
+    uint32_t n_embd_r() const;

    // dimension of the recurrent state embeddings
-    uint32_t n_embd_v_s() const;
+    uint32_t n_embd_s() const;
+
+    // whether or not the given layer is recurrent (for hybrid models)
+    bool is_recurrent(uint32_t il) const;
+
+    uint32_t n_pos_per_embd() const;

    bool is_swa(uint32_t il) const;
 };
--- a/examples/talk-llama/llama-kv-cache-unified-iswa.cpp
+++ b/examples/talk-llama/llama-kv-cache-unified-iswa.cpp
@ -0,0 +1,279 @@
+#include "llama-kv-cache-unified-iswa.h"
+
+#include "llama-impl.h"
+#include "llama-batch.h"
+#include "llama-model.h"
+
+#include <algorithm>
+#include <cassert>
+
+//
+// llama_kv_cache_unified_iswa
+//
+
+llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
+        const llama_model & model,
+                ggml_type   type_k,
+                ggml_type   type_v,
+                     bool   v_trans,
+                     bool   offload,
+                     bool   swa_full,
+                 uint32_t   kv_size,
+                 uint32_t   n_seq_max,
+                 uint32_t   n_ubatch,
+                 uint32_t   n_pad) : hparams(model.hparams) {
+    llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
+    llama_kv_cache_unified::layer_filter_cb filter_swa  = [&](int32_t il) { return  model.hparams.is_swa(il); };
+
+    const uint32_t size_base = kv_size;
+
+    uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
+
+    // when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
+    if (swa_full) {
+        LLAMA_LOG_WARN("%s: using full-size SWA cache (ref: %s)\n",
+                __func__, "https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055");
+
+        size_swa = size_base;
+    }
+
+    LLAMA_LOG_INFO("%s: creating non-SWA KV cache, size = %u cells\n", __func__, size_base);
+
+    kv_base = std::make_unique<llama_kv_cache_unified>(
+            model, std::move(filter_base), type_k, type_v,
+            v_trans, offload, size_base, n_seq_max, n_pad,
+            0, LLAMA_SWA_TYPE_NONE);
+
+    LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);
+
+    kv_swa = std::make_unique<llama_kv_cache_unified>(
+            model, std::move(filter_swa), type_k, type_v,
+            v_trans, offload, size_swa, n_seq_max, n_pad,
+            hparams.n_swa, hparams.swa_type);
+}
+
+void llama_kv_cache_unified_iswa::clear(bool data) {
+    kv_base->clear(data);
+    kv_swa ->clear(data);
+}
+
+bool llama_kv_cache_unified_iswa::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    bool res = true;
+
+    res = res & kv_base->seq_rm(seq_id, p0, p1);
+    res = res & kv_swa ->seq_rm(seq_id, p0, p1);
+
+    return res;
+}
+
+void llama_kv_cache_unified_iswa::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    kv_base->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    kv_swa ->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_kv_cache_unified_iswa::seq_keep(llama_seq_id seq_id) {
+    kv_base->seq_keep(seq_id);
+    kv_swa ->seq_keep(seq_id);
+}
+
+void llama_kv_cache_unified_iswa::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+    kv_base->seq_add(seq_id, p0, p1, shift);
+    kv_swa ->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_kv_cache_unified_iswa::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    kv_base->seq_div(seq_id, p0, p1, d);
+    kv_swa ->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_kv_cache_unified_iswa::seq_pos_min(llama_seq_id seq_id) const {
+    // the base cache is a superset of the SWA cache, so we can just check the SWA cache
+    return kv_swa->seq_pos_min(seq_id);
+}
+
+llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
+    return kv_swa->seq_pos_max(seq_id);
+}
+
+llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+    GGML_UNUSED(embd_all);
+
+    // first try simple split
+    do {
+        balloc.split_reset();
+
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            auto ubatch = balloc.split_simple(n_ubatch);
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
+        }
+
+        auto heads_base = kv_base->prepare(ubatches);
+        if (heads_base.empty()) {
+            break;
+        }
+
+        auto heads_swa = kv_swa->prepare(ubatches);
+        if (heads_swa.empty()) {
+            break;
+        }
+
+        assert(heads_base.size() == heads_swa.size());
+
+        return std::make_unique<llama_kv_cache_unified_iswa_state>(
+                this, std::move(heads_base), std::move(heads_swa), std::move(ubatches));
+    } while (false);
+
+    // if it fails, try equal split
+    do {
+        balloc.split_reset();
+
+        std::vector<llama_ubatch> ubatches;
+        while (true) {
+            auto ubatch = balloc.split_equal(n_ubatch);
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
+        }
+
+        auto heads_base = kv_base->prepare(ubatches);
+        if (heads_base.empty()) {
+            break;
+        }
+
+        auto heads_swa = kv_swa->prepare(ubatches);
+        if (heads_swa.empty()) {
+            break;
+        }
+
+        assert(heads_base.size() == heads_swa.size());
+
+        return std::make_unique<llama_kv_cache_unified_iswa_state>(
+                this, std::move(heads_base), std::move(heads_swa), std::move(ubatches));
+    } while (false);
+
+    // TODO: if we fail again, we should attempt different splitting strategies
+    //       but to do that properly, we first have to refactor the batches to be more flexible
+
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_state_ptr llama_kv_cache_unified_iswa::init_full() {
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(this);
+}
+
+llama_memory_state_ptr llama_kv_cache_unified_iswa::init_update(llama_context * lctx, bool optimize) {
+    return std::make_unique<llama_kv_cache_unified_iswa_state>(this, lctx, optimize);
+}
+
+bool llama_kv_cache_unified_iswa::get_can_shift() const {
+    return kv_base->get_size() == kv_swa->get_size();
+}
+
+void llama_kv_cache_unified_iswa::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+    kv_base->state_write(io, seq_id);
+    kv_swa ->state_write(io, seq_id);
+}
+
+void llama_kv_cache_unified_iswa::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+    kv_base->state_read(io, seq_id);
+    kv_swa ->state_read(io, seq_id);
+}
+
+llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_base() const {
+    return kv_base.get();
+}
+
+llama_kv_cache_unified * llama_kv_cache_unified_iswa::get_swa() const {
+    return kv_swa.get();
+}
+
+//
+// llama_kv_cache_unified_iswa_state
+//
+
+llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(llama_memory_status status) : status(status) {}
+
+llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
+        llama_kv_cache_unified_iswa * kv) :
+    state_base(kv->get_base()->init_full()),
+    state_swa (kv->get_swa ()->init_full()),
+    status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) {
+}
+
+llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
+        llama_kv_cache_unified_iswa * kv,
+        llama_context * lctx,
+        bool optimize) :
+    state_base(kv->get_base()->init_update(lctx, optimize)),
+    state_swa (kv->get_swa ()->init_update(lctx, optimize)),
+    status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) {
+}
+
+llama_kv_cache_unified_iswa_state::llama_kv_cache_unified_iswa_state(
+        llama_kv_cache_unified_iswa * kv,
+        std::vector<uint32_t> heads_base,
+        std::vector<uint32_t> heads_swa,
+        std::vector<llama_ubatch> ubatches) :
+    ubatches(std::move(ubatches)),
+    // note: here we copy the ubatches. not sure if this is ideal
+    state_base(new llama_kv_cache_unified_state(kv->get_base(), std::move(heads_base), this->ubatches)),
+    state_swa (new llama_kv_cache_unified_state(kv->get_swa (), std::move(heads_swa),  this->ubatches)),
+    status(llama_memory_status_combine(state_base->get_status(), state_swa->get_status())) {
+}
+
+llama_kv_cache_unified_iswa_state:: ~llama_kv_cache_unified_iswa_state() = default;
+
+bool llama_kv_cache_unified_iswa_state::next() {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    state_base->next();
+    state_swa ->next();
+
+    if (++i_next >= ubatches.size()) {
+        return false;
+    }
+
+    return true;
+}
+
+bool llama_kv_cache_unified_iswa_state::apply() {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    bool res = true;
+
+    res = res & state_base->apply();
+    res = res & state_swa ->apply();
+
+    return res;
+}
+
+llama_memory_status llama_kv_cache_unified_iswa_state::get_status() const {
+    return status;
+}
+
+const llama_ubatch & llama_kv_cache_unified_iswa_state::get_ubatch() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return ubatches[i_next];
+}
+
+const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_base() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return static_cast<const llama_kv_cache_unified_state *>(state_base.get());
+}
+
+const llama_kv_cache_unified_state * llama_kv_cache_unified_iswa_state::get_swa()  const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    return static_cast<const llama_kv_cache_unified_state *>(state_swa.get());
+}
--- a/examples/talk-llama/llama-kv-cache-unified-iswa.h
+++ b/examples/talk-llama/llama-kv-cache-unified-iswa.h
@ -0,0 +1,128 @@
+#pragma once
+
+#include "llama-kv-cache-unified.h"
+
+#include <vector>
+
+//
+// llama_kv_cache_unified_iswa
+//
+
+// utilizes two instances of llama_kv_cache_unified
+//   the first instance is for the non-SWA layers of the model and the second instance is for the SWA layers
+
+class llama_kv_cache_unified_iswa : public llama_memory_i {
+public:
+    llama_kv_cache_unified_iswa(
+            const llama_model & model,
+                    ggml_type   type_k,
+                    ggml_type   type_v,
+                         bool   v_trans,
+                         bool   offload,
+                         bool   swa_full,
+                     uint32_t   kv_size,
+                     uint32_t   n_seq_max,
+                     uint32_t   n_ubatch,
+                     uint32_t   n_pad);
+
+    ~llama_kv_cache_unified_iswa() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    llama_memory_state_ptr init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) override;
+
+    llama_memory_state_ptr init_full() override;
+
+    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
+
+    bool get_can_shift() const override;
+
+    void clear(bool data) override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;
+
+    //
+    // llama_kv_cache_unified_iswa specific API
+    //
+
+    llama_kv_cache_unified * get_base() const;
+    llama_kv_cache_unified * get_swa () const;
+
+private:
+    const llama_hparams & hparams;
+
+    std::unique_ptr<llama_kv_cache_unified> kv_base;
+    std::unique_ptr<llama_kv_cache_unified> kv_swa;
+};
+
+class llama_kv_cache_unified_iswa_state : public llama_memory_state_i {
+public:
+    // used for errors
+    llama_kv_cache_unified_iswa_state(llama_memory_status status);
+
+    // used to create a full-cache state
+    llama_kv_cache_unified_iswa_state(
+            llama_kv_cache_unified_iswa * kv);
+
+    // used to create an update state
+    llama_kv_cache_unified_iswa_state(
+            llama_kv_cache_unified_iswa * kv,
+            llama_context * lctx,
+            bool optimize);
+
+    // used to create a state from a batch
+    llama_kv_cache_unified_iswa_state(
+            llama_kv_cache_unified_iswa * kv,
+            std::vector<uint32_t> heads_base,
+            std::vector<uint32_t> heads_swa,
+            std::vector<llama_ubatch> ubatches);
+
+    virtual ~llama_kv_cache_unified_iswa_state();
+
+    //
+    // llama_memory_state_i
+    //
+
+    bool next()  override;
+    bool apply() override;
+
+    llama_memory_status  get_status() const override;
+    const llama_ubatch & get_ubatch() const override;
+
+    //
+    // llama_kv_cache_unified_iswa_state specific API
+    //
+
+    const llama_kv_cache_unified_state * get_base() const;
+    const llama_kv_cache_unified_state * get_swa()  const;
+
+private:
+    //llama_kv_cache_unified_iswa * kv;
+
+    // the index of the next ubatch to process
+    size_t i_next = 0;
+
+    std::vector<llama_ubatch> ubatches;
+
+    const llama_memory_state_ptr state_base;
+    const llama_memory_state_ptr state_swa;
+
+    const llama_memory_status status;
+};
--- a/examples/talk-llama/llama-kv-cache-unified.cpp
+++ b/examples/talk-llama/llama-kv-cache-unified.cpp
--- a/examples/talk-llama/llama-kv-cache-unified.h
+++ b/examples/talk-llama/llama-kv-cache-unified.h
@ -0,0 +1,303 @@
+#pragma once
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-kv-cells.h"
+#include "llama-memory.h"
+
+#include <unordered_map>
+#include <vector>
+
+struct llama_cparams;
+struct llama_hparams;
+struct llama_model;
+struct llama_context;
+
+//
+// llama_kv_cache_unified
+//
+
+class llama_kv_cache_unified : public llama_memory_i {
+public:
+    static uint32_t get_padding(const llama_cparams & cparams);
+
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
+    using ubatch_heads = std::vector<uint32_t>;
+
+    struct defrag_info {
+        bool empty() const {
+            return ids.empty();
+        }
+
+        // contains information about which cell moves where:
+        //  - cell i moves to ids[i]
+        //  - if ids[i] == i || ids[i] == ids.size(), then cell i is not moved
+        std::vector<uint32_t> ids;
+    };
+
+    llama_kv_cache_unified(
+            const llama_model &  model,
+              layer_filter_cb && filter,
+                    ggml_type    type_k,
+                    ggml_type    type_v,
+                         bool    v_trans,
+                         bool    offload,
+                     uint32_t    kv_size,
+                     uint32_t    n_seq_max,
+                     uint32_t    n_pad,
+                     uint32_t    n_swa,
+               llama_swa_type    swa_type);
+
+    ~llama_kv_cache_unified() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    llama_memory_state_ptr init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) override;
+
+    llama_memory_state_ptr init_full() override;
+
+    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
+
+    bool get_can_shift() const override;
+
+    void clear(bool data) override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;
+
+    //
+    // llama_kv_cache_unified specific API
+    //
+
+    uint32_t get_size() const;
+
+    bool get_has_shift() const;
+
+    //
+    // graph_build API
+    //
+
+    uint32_t get_n_kv() const;
+
+    // get views of the current state of the cache
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
+
+    // store k_cur and v_cur in the cache based on the provided head location
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il, uint32_t head_cur) const;
+    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il, uint32_t head_cur) const;
+
+    //
+    // preparation API
+    //
+
+    // find places for the provided ubatches in the cache, returns the head locations
+    // return empty vector on failure
+    ubatch_heads prepare(const std::vector<llama_ubatch> & ubatches);
+
+    bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);
+
+    // return the cell position where we can insert the ubatch
+    // return -1 on failure to find a contiguous slot of kv cells
+    int32_t find_slot(const llama_ubatch & ubatch) const;
+
+    // emplace the ubatch context into slot: [head_cur, head_cur + ubatch.n_tokens)
+    void apply_ubatch(uint32_t head_cur, const llama_ubatch & ubatch);
+
+    //
+    // set_input API
+    //
+
+    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+    void set_input_k_shift   (ggml_tensor * dst) const;
+    void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+
+private:
+    const llama_model & model;
+    const llama_hparams & hparams;
+
+    struct kv_layer {
+        // layer index in the model
+        // note: can be different from the layer index in the KV cache
+        uint32_t il;
+
+        ggml_tensor * k;
+        ggml_tensor * v;
+    };
+
+    bool v_trans = true;  // the value tensor is transposed
+
+    // the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
+    // note: this is not part of the KV state and it's only used to speed-up the find_slot() method
+    uint32_t head = 0;
+
+    const uint32_t n_seq_max = 1;
+
+    // required padding
+    const uint32_t n_pad = 1;
+
+    // SWA
+    const uint32_t n_swa = 0;
+
+    int debug = 0;
+
+    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
+
+    std::vector<ggml_context_ptr>        ctxs;
+    std::vector<ggml_backend_buffer_ptr> bufs;
+
+    llama_kv_cells_unified cells;
+
+    std::vector<kv_layer> layers;
+
+    // model layer id -> KV cache layer id
+    std::unordered_map<int32_t, int32_t> map_layer_ids;
+
+    // return non-empty vector if cells have been moved
+    defrag_info defrag_prepare(int32_t n_max_nodes) const;
+
+    size_t total_size() const;
+
+    size_t size_k_bytes() const;
+    size_t size_v_bytes() const;
+
+    bool is_masked_swa(llama_pos p0, llama_pos p1) const;
+
+    ggml_tensor * build_rope_shift(
+            const llama_cparams & cparams,
+                   ggml_context * ctx,
+                    ggml_tensor * cur,
+                    ggml_tensor * shift,
+                    ggml_tensor * factors,
+                          float   freq_base,
+                          float   freq_scale) const;
+
+    llm_graph_result_ptr build_graph_shift(
+            const llama_cparams & cparams,
+                   ggml_context * ctx,
+                    ggml_cgraph * gf) const;
+
+    llm_graph_result_ptr build_graph_defrag(
+            const llama_cparams & cparams,
+                   ggml_context * ctx,
+                    ggml_cgraph * gf,
+              const defrag_info & dinfo) const;
+
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+};
+
+class llama_kv_cache_unified_state : public llama_memory_state_i {
+public:
+    // some shorthands
+    using ubatch_heads = llama_kv_cache_unified::ubatch_heads;
+    using defrag_info  = llama_kv_cache_unified::defrag_info;
+
+    // used for errors
+    llama_kv_cache_unified_state(llama_memory_status status);
+
+    // used to create a full-cache state
+    llama_kv_cache_unified_state(
+            llama_kv_cache_unified * kv);
+
+    // used to create an update state
+    llama_kv_cache_unified_state(
+            llama_kv_cache_unified * kv,
+            llama_context * lctx,
+            bool do_shift,
+            defrag_info dinfo);
+
+    // used to create a decode state from a batch
+    llama_kv_cache_unified_state(
+            llama_kv_cache_unified * kv,
+            ubatch_heads heads,
+            std::vector<llama_ubatch> ubatches);
+
+    virtual ~llama_kv_cache_unified_state();
+
+    //
+    // llama_memory_state_i
+    //
+
+    bool next()  override;
+    bool apply() override;
+
+    llama_memory_status  get_status() const override;
+    const llama_ubatch & get_ubatch() const override;
+
+    //
+    // llama_kv_cache_unified_state specific API
+    //
+
+    uint32_t get_n_kv() const;
+
+    // get views of the current state of the cache
+    ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
+    ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
+
+    // store k_cur and v_cur in the cache based on the provided head location
+    ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const;
+    ggml_tensor * cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const;
+
+    void set_input_k_shift(ggml_tensor * dst) const;
+
+    void set_input_kq_mask   (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
+    void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
+
+private:
+    llama_memory_status status;
+
+    llama_kv_cache_unified * kv;
+    llama_context * lctx;
+
+    //
+    // update state
+    //
+
+    bool do_shift = false;
+
+    defrag_info dinfo;
+
+    //
+    // batch processing state
+    //
+
+    // the index of the next ubatch to process
+    size_t i_next = 0;
+
+    ubatch_heads heads;
+
+    std::vector<llama_ubatch> ubatches;
+
+    //
+    // data needed for building the compute graph for the current ubatch:
+    //
+
+    // a heuristic, to avoid attending the full cache if it is not yet utilized
+    // as the cache gets filled, the benefit from this heuristic disappears
+    int32_t n_kv;
+
+    // the beginning of the current slot in which the ubatch will be inserted
+    int32_t head;
+};
--- a/examples/talk-llama/llama-kv-cache.cpp
+++ b/examples/talk-llama/llama-kv-cache.cpp
--- a/examples/talk-llama/llama-kv-cache.h
+++ b/examples/talk-llama/llama-kv-cache.h
@ -4,210 +4,41 @@
 #include "llama-io.h"
 #include "llama-memory.h"

-#include "ggml-cpp.h"
-
-#include <functional>
-#include <set>
-#include <vector>
-
-struct llama_cparams;
-struct llama_hparams;
-struct llama_ubatch;
-
 struct llama_kv_cache : public llama_memory_i {
-    using llama_memory_i::llama_memory_i;
+    virtual ~llama_kv_cache() = default;

-    virtual void restore() = 0; // call if batch processing fails - restores the cache state
-    virtual void commit() = 0;  // call after successful batch processing - clears any pending state
+    // split the input batch into a set of ubatches and verify that they can fit into the cache
+    // return a state object containing the ubatches and KV cache state required to process them
+    // check the llama_memory_state_i::get_status() for the result
+    virtual llama_memory_state_ptr init_batch(
+            const llama_batch & batch,
+            uint32_t n_ubatch,
+            bool embd_pooled,
+            bool logits_all) = 0;

-    virtual int32_t get_n_tokens()   const = 0;
-    virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+    // simulate full cache, used for allocating worst-case compute buffers
+    virtual llama_memory_state_ptr init_full() = 0;

+    // process any pending defrag/shift/etc. operations
+    // optionally call once before processing a new batch
+    // return true if any operations were performed
+    virtual bool update(llama_context & lctx) = 0;
+
+    // schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
+    // TODO: change to
+    //   llama_memory_state_ptr init_defrag(float thold) = 0;
+    //
+    virtual void defrag_sched(float thold) = 0;
+
+    // getters
    virtual bool get_can_shift() const = 0;

    bool get_can_edit() const override { return get_can_shift(); }
+
+    //
+    // state write/read
+    //
+
+    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
+    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) = 0;
 };
-
-struct llama_kv_cache_guard {
-    llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}
-
-    ~llama_kv_cache_guard() {
-        kv->restore();
-    }
-
-    void commit() {
-        kv->commit();
-    }
-
-private:
-    llama_kv_cache * kv;
-};
-
-struct llama_kv_cell {
-    llama_pos pos   = -1;
-    llama_pos delta =  0;
-    int32_t   src   = -1; // used by recurrent state models to copy states
-    int32_t   tail  = -1;
-
-    std::set<llama_seq_id> seq_id;
-
-    bool has_seq_id(const llama_seq_id & id) const {
-        return seq_id.find(id) != seq_id.end();
-    }
-
-    bool is_empty() const {
-        return seq_id.empty();
-    }
-
-    bool is_same_seq(const llama_kv_cell & other) const {
-        return seq_id == other.seq_id;
-    }
-};
-
-// ring-buffer of cached KV data
-// TODO: pimpl
-// TODO: add notion of max sequences
-class llama_kv_cache_unified : public llama_kv_cache {
-public:
-    // can be used to query data from the model if needed
-    struct callbacks {
-        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
-    };
-
-    llama_kv_cache_unified(
-            const llama_hparams & hparams,
-            callbacks             cbs);
-
-    virtual ~llama_kv_cache_unified() = default;
-
-    // TODO: become constructor
-    bool init(
-            const llama_model & model,   // TODO: do not reference the model
-          const llama_cparams & cparams,
-                    ggml_type   type_k,
-                    ggml_type   type_v,
-                     uint32_t   kv_size,
-                         bool   offload);
-
-    int32_t get_n_tokens()   const override;
-    int32_t get_used_cells() const override;
-
-    size_t total_size() const;
-
-    // TODO: better data structures to reduce the cost of this operation
-    llama_pos pos_max() const;
-
-    void clear() override;
-    void defrag() override;
-
-    virtual void restore() override;
-    virtual void commit() override;
-
-    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
-    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
-    void seq_keep(llama_seq_id seq_id) override;
-    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
-    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
-
-    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
-
-    bool get_can_shift() const override;
-
-    // find an empty slot of size "n_tokens" in the cache
-    // updates the cache head
-    // Note: On success, it's important that cache.head points
-    // to the first cell of the slot.
-    bool find_slot(const llama_ubatch & batch);
-
-    // TODO: maybe not needed
-    uint32_t get_padding(const llama_cparams & cparams) const;
-
-    // find how many cells are currently in use
-    uint32_t cell_max() const;
-
-    size_t size_k_bytes() const;
-    size_t size_v_bytes() const;
-
-    // defrag
-
-    struct {
-        std::vector<uint32_t> ids;
-    } defrag_info;
-
-    // return true if cells have been moved
-    bool defrag_prepare(int32_t n_max_nodes);
-
-    // commit/restore cache
-
-    struct slot_range {
-        uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-        uint32_t c1 = 0;
-    };
-
-    // pending cell updates that are not yet committed
-    struct {
-        std::vector<slot_range> ranges;
-    } pending;
-
-    // state write/load
-
-    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
-    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);
-
-    // members
-
-    const llama_hparams & hparams;
-
-    callbacks cbs;
-
-    bool has_shift = false;
-    bool do_defrag = false;
-
-    // TODO: remove this and implement llama_kv_cache_recurrent instead
-    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
-
-    bool v_trans   = true;  // the value tensor is transposed
-    bool can_shift = false;
-
-    // Note: The value of head isn't only used to optimize searching
-    // for a free KV slot. llama_decode_impl also uses it, so it
-    // cannot be freely changed after a slot has been allocated.
-    uint32_t head = 0;
-    uint32_t size = 0;
-    uint32_t used = 0; // used cells (i.e. at least one seq_id)
-
-    // computed before each graph build
-    uint32_t n = 0;
-
-    std::vector<llama_kv_cell> cells;
-
-    std::vector<ggml_tensor *> k_l; // per layer
-    std::vector<ggml_tensor *> v_l;
-
-private:
-    ggml_type type_k = GGML_TYPE_F16;
-    ggml_type type_v = GGML_TYPE_F16;
-
-    std::vector<ggml_context_ptr>        ctxs;
-    std::vector<ggml_backend_buffer_ptr> bufs;
-
-    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
-    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
-
-    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
-};
-
-// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
-//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
-//public:
-//    using llama_kv_cache_unified::llama_kv_cache_unified;
-//};
-
-//
-// kv cache view
-//
-
-llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);
-
-void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);
--- a/examples/talk-llama/llama-kv-cells.h
+++ b/examples/talk-llama/llama-kv-cells.h
@ -0,0 +1,415 @@
+#pragma once
+
+#include "llama.h"
+#include "llama-cparams.h"
+
+#include <bitset>
+#include <cassert>
+#include <vector>
+#include <set>
+
+// meta information about KV cells that can be part of multiple sequences at the same time
+// TODO: add unit tests
+class llama_kv_cells_unified {
+public:
+    void reset() {
+        for (uint32_t i = 0; i < pos.size(); ++i) {
+            pos[i]   = -1;
+            shift[i] =  0;
+            seq[i].reset();
+        }
+
+        has_shift = false;
+
+        used.clear();
+
+        for (uint32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
+            seq_pos[s].clear();
+        }
+    }
+
+    void reset_shift() {
+        has_shift = false;
+
+        for (uint32_t i = 0; i < shift.size(); ++i) {
+            shift[i] = 0;
+        }
+    }
+
+    uint32_t size() const {
+        return pos.size();
+    }
+
+    void resize(uint32_t n) {
+        pos.resize(n);
+        shift.resize(n);
+        seq.resize(n);
+
+        reset();
+    }
+
+    bool is_empty(uint32_t i) const {
+        assert(i < pos.size());
+        assert((pos[i] < 0 && pos[i] == -1) || pos[i] >= 0);
+
+        return pos[i] == -1;
+    }
+
+    uint32_t get_used() const {
+        return used.size();
+    }
+
+    // the index of the first cell that is used
+    // return 0 if no cells are used
+    uint32_t used_min() const {
+        return used.empty() ? 0 : *used.begin();
+    }
+
+    // the index of the last cell that is used + 1
+    // return 0 if no cells are used
+    uint32_t used_max_p1() const {
+        return used.empty() ? 0 : *used.rbegin() + 1;
+    }
+
+    bool get_has_shift() const {
+        return has_shift;
+    }
+
+    // move cell isrc to idst (used during defrag)
+    void mv(uint32_t isrc, uint32_t idst) {
+        assert(isrc < pos.size());
+        assert(idst < pos.size());
+
+        assert(pos[idst] == -1);
+        assert(pos[isrc] != -1);
+
+        pos  [idst] = pos  [isrc];
+        shift[idst] = shift[isrc];
+        seq  [idst] = seq  [isrc];
+
+        pos  [isrc] = -1;
+        shift[isrc] =  0;
+        seq  [isrc].reset();
+
+        used.erase (isrc);
+        used.insert(idst);
+    }
+
+    // copy the state of cells [i, i + n) (used for save/restore the state of the cells)
+    llama_kv_cells_unified cp(uint32_t i, uint32_t n) const {
+        assert(i + n <= pos.size());
+
+        llama_kv_cells_unified res;
+
+        res.resize(n);
+
+        for (uint32_t j = 0; j < n; ++j) {
+            res.pos[j] = pos[i + j];
+            res.seq[j] = seq[i + j];
+
+            assert(shift[i + j] == 0);
+        }
+
+        return res;
+    }
+
+    // set the state of cells [i, i + other.pos.size()) (used for save/restore the state of the cells)
+    void set(uint32_t i, const llama_kv_cells_unified & other) {
+        assert(i + other.pos.size() <= pos.size());
+
+        for (uint32_t j = 0; j < other.pos.size(); ++j) {
+            if (pos[i + j] == -1 && other.pos[j] != -1) {
+                used.insert(i + j);
+            }
+
+            if (pos[i + j] != -1 && other.pos[j] == -1) {
+                used.erase(i + j);
+            }
+
+            if (pos[i + j] != -1) {
+                seq_pos_rm(i + j);
+            }
+
+            pos[i + j] = other.pos[j];
+            seq[i + j] = other.seq[j];
+
+            if (pos[i + j] != -1) {
+                seq_pos_add(i + j);
+            }
+
+            assert(shift[i + j] == 0);
+        }
+    }
+
+    // clear a non-empty cell
+    void rm(uint32_t i) {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+
+        seq_pos_rm(i);
+        seq[i].reset();
+
+        pos[i] = -1;
+        shift[i] = 0;
+
+        used.erase(i);
+    }
+
+    // note: call only if the cell has seq_id
+    // return true if the cell becomes empty
+    bool seq_rm(uint32_t i, llama_seq_id seq_id) {
+        assert(i < pos.size());
+        assert(seq[i].test(seq_id));
+        assert(pos[i] != -1);
+        assert(seq_id >= 0);
+
+        seq[i].reset(seq_id);
+        seq_pos[seq_id].erase(pos[i]);
+
+        if (seq[i].none()) {
+            pos[i] = -1;
+            shift[i] = 0;
+
+            used.erase(i);
+
+            return true;
+        }
+
+        return false;
+    }
+
+    // return true if the cell becomes empty (i.e. it did not contain seq_id before the call)
+    bool seq_keep(uint32_t i, llama_seq_id seq_id) {
+        assert(i < pos.size());
+
+        if (seq[i].test(seq_id)) {
+            seq_pos_rm(i);
+            seq[i].reset();
+
+            seq[i].set(seq_id);
+            seq_pos[seq_id].insert(pos[i]);
+
+            return false;
+        }
+
+        if (seq[i].any()) {
+            seq_pos_rm(i);
+            seq[i].reset();
+
+            pos[i] = -1;
+            shift[i] = 0;
+
+            used.erase(i);
+
+            return true;
+        }
+
+        assert(pos[i] == -1);
+
+        return false;
+    }
+
+    // number of different sequences in the cell
+    int seq_count(uint32_t i) const {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+
+        return seq[i].count();
+    }
+
+    // check if the cell contains seq_id
+    bool seq_has(uint32_t i, llama_seq_id seq_id) const {
+        assert(i < pos.size());
+        assert(seq_id >= 0);
+
+        return seq[i].test(seq_id);
+    }
+
+    // note: call only if the cell is not empty and the seq_id is not in the cell
+    void seq_add(uint32_t i, llama_seq_id seq_id) {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+        assert(!seq[i].test(seq_id));
+
+        seq[i].set(seq_id);
+        seq_pos[seq_id].insert(pos[i]);
+    }
+
+    // return the sequence id of this cell
+    // note: call only for cells with exactly one sequence
+    llama_seq_id seq_get(uint32_t i) const {
+        assert(seq[i].count() == 1);
+
+        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+            if (seq[i].test(s)) {
+                return s;
+            }
+        }
+
+        return -1;
+    }
+
+    // the minimum position of sequence seq_id currently present in any of the cells
+    // return -1 if the sequence is not present
+    llama_pos seq_pos_min(llama_seq_id seq_id) const {
+        assert(seq_id >= 0);
+        assert(seq_id < LLAMA_MAX_SEQ);
+
+        if (seq_pos[seq_id].empty()) {
+            return -1;
+        }
+
+        return *seq_pos[seq_id].begin();
+    }
+
+    // the maximum position of sequence seq_id currently present in any of the cells
+    // return -1 if the sequence is not present
+    llama_pos seq_pos_max(llama_seq_id seq_id) const {
+        assert(seq_id >= 0);
+        assert(seq_id < LLAMA_MAX_SEQ);
+
+        if (seq_pos[seq_id].empty()) {
+            return -1;
+        }
+
+        return *seq_pos[seq_id].rbegin();
+    }
+
+    // note: call only if the cell is not empty
+    llama_pos pos_get(uint32_t i) const {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+
+        return pos[i];
+    }
+
+    // note: call only if the cell is not empty
+    llama_pos get_shift(uint32_t i) const {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+
+        return shift[i];
+    }
+
+    // check if a cell is not empty and its position is within [p0, p1)
+    bool pos_in(uint32_t i, llama_pos p0, llama_pos p1) const {
+        assert(i < pos.size());
+
+        return pos[i] >= p0 && pos[i] < p1;
+    }
+
+    // set the position of an empty cell
+    // does not modify "has_shift"
+    // note: call only if the cell is empty
+    void pos_set(uint32_t i, llama_pos p) {
+        assert(i < pos.size());
+        assert(pos[i] == -1);
+        assert(seq[i].none());
+
+        pos[i] = p;
+
+        used.insert(i);
+    }
+
+    // pos[i] = pos[i] + d
+    // sets "has_shift" to true
+    // note: call only if the cell is not empty
+    bool pos_add(uint32_t i, llama_pos d) {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+
+        seq_pos_rm(i);
+
+        pos[i]   += d;
+        shift[i] += d;
+
+        has_shift = true;
+
+        if (pos[i] < 0) {
+            seq[i].reset();
+            pos[i] = -1;
+            shift[i] = 0;
+
+            used.erase(i);
+
+            return true;
+        }
+
+        seq_pos_add(i);
+
+        return false;
+    }
+
+    // pos[i] = pos[i] / d
+    // sets "has_shift" to true
+    // note: call only if the cell is not empty
+    void pos_div(uint32_t i, int d) {
+        assert(i < pos.size());
+        assert(pos[i] != -1);
+
+        const llama_pos p_old = pos[i];
+
+        seq_pos_rm(i);
+
+        pos[i]   /= d;
+        shift[i] += p_old - pos[i];
+
+        seq_pos_add(i);
+
+        has_shift = true;
+    }
+
+private:
+    bool has_shift = false;
+
+    // set of indices of used cells (i.e. pos[i] != -1, allowed to not have any seq_id)
+    std::set<uint32_t> used;
+
+    std::vector<llama_pos> pos;
+
+    // this array accumulates any applied shifts to the pos array since the last reset_shift() call
+    // this is used to queue multiple updates to the pos array, which in the end can be applied in one go:
+    //
+    //   cells.pos_add(x, shift_x);
+    //   cells.pos_div(y, shift_y);
+    //   ...
+    //
+    //   if (cells.has_shift()) {
+    //      for (int i = 0; i < n; ++i) {
+    //          auto shift_i = cells.get_shift(i);
+    //          ...
+    //      }
+    //      cells.reset_shift();
+    //   }
+    //
+    std::vector<llama_pos> shift;
+
+    using seq_set_t = std::bitset<LLAMA_MAX_SEQ>;
+
+    // the bitset seq[i] tells us which sequences are currently occupying the i-th cell
+    std::vector<seq_set_t> seq;
+
+    // the set seq_pos[s] tells us which positions are currently present for sequence s
+    // this way seq_pos[s].begin() and seq_pos[s].rbegin() give us the min/max positions currently in the cache
+    std::set<llama_pos> seq_pos[LLAMA_MAX_SEQ];
+
+    // helper functions for updating `seq_pos`, once cell at a time:
+
+    // remove cell i
+    void seq_pos_rm(uint32_t i) {
+        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+            if (seq[i].test(s)) {
+                seq_pos[s].erase(pos[i]);
+            }
+        }
+    }
+
+    // add cell i
+    void seq_pos_add(uint32_t i) {
+        for (int s = 0; s < LLAMA_MAX_SEQ; ++s) {
+            if (seq[i].test(s)) {
+                seq_pos[s].insert(pos[i]);
+            }
+        }
+    }
+};
--- a/examples/talk-llama/llama-memory-hybrid.cpp
+++ b/examples/talk-llama/llama-memory-hybrid.cpp
@ -0,0 +1,246 @@
+#include "llama-memory-hybrid.h"
+
+#include "llama-impl.h"
+#include "llama-model.h"
+#include "llama-context.h"
+
+//
+// llama_memory_hybrid
+//
+
+llama_memory_hybrid::llama_memory_hybrid(
+    const llama_model & model,
+                         /* attn */
+            ggml_type    type_k,
+            ggml_type    type_v,
+                 bool    v_trans,
+             uint32_t    kv_size,
+             uint32_t    n_pad,
+             uint32_t    n_swa,
+       llama_swa_type    swa_type,
+                         /* recurrent */
+            ggml_type    type_r,
+            ggml_type    type_s,
+             uint32_t    rs_size,
+                         /* common */
+             uint32_t    n_seq_max,
+                 bool    offload,
+                         /* layer filters */
+      layer_filter_cb && filter_attn,
+      layer_filter_cb && filter_recr) :
+    hparams(model.hparams),
+    mem_attn(new llama_kv_cache_unified(
+        model,
+        filter_attn == nullptr ?
+            [&](int32_t il) { return !hparams.is_recurrent(il); }
+            : filter_attn,
+        type_k,
+        type_v,
+        v_trans,
+        offload,
+        kv_size,
+        n_seq_max,
+        n_pad,
+        n_swa,
+        swa_type
+    )),
+    mem_recr(new llama_memory_recurrent(
+        model,
+        filter_recr == nullptr ?
+            [&](int32_t il) { return hparams.is_recurrent(il); }
+            : filter_recr,
+        type_r,
+        type_s,
+        offload,
+        rs_size,
+        n_seq_max
+    )) {}
+
+llama_memory_state_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & balloc, uint32_t n_ubatch, bool embd_all) {
+    do {
+        balloc.split_reset();
+
+        // follow the recurrent pattern for creating the ubatch splits
+        std::vector<llama_ubatch> ubatches;
+
+        while (true) {
+            llama_ubatch ubatch;
+
+            if (embd_all) {
+                // if all tokens are output, split by sequence
+                ubatch = balloc.split_seq(n_ubatch);
+            } else {
+                ubatch = balloc.split_equal(n_ubatch);
+            }
+
+            if (ubatch.n_tokens == 0) {
+                break;
+            }
+
+            ubatches.push_back(std::move(ubatch)); // NOLINT
+        }
+
+        // prepare the recurrent batches first
+        if (!mem_recr->prepare(ubatches)) {
+            // TODO: will the recurrent cache be in an undefined state at this point?
+            LLAMA_LOG_ERROR("%s: failed to prepare recurrent ubatches\n", __func__);
+            return std::make_unique<llama_memory_hybrid_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+        }
+
+        // prepare the attention cache
+        auto heads_attn = mem_attn->prepare(ubatches);
+        if (heads_attn.empty()) {
+            LLAMA_LOG_ERROR("%s: failed to prepare attention ubatches\n", __func__);
+            return std::make_unique<llama_memory_hybrid_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+        }
+
+        return std::make_unique<llama_memory_hybrid_state>(
+                this, std::move(heads_attn), std::move(ubatches));
+    } while(false);
+
+    return std::make_unique<llama_memory_hybrid_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+}
+
+llama_memory_state_ptr llama_memory_hybrid::init_full() {
+    return std::make_unique<llama_memory_hybrid_state>(this);
+}
+
+llama_memory_state_ptr llama_memory_hybrid::init_update(llama_context * lctx, bool optimize) {
+    return std::make_unique<llama_memory_hybrid_state>(this, lctx, optimize);
+}
+
+bool llama_memory_hybrid::get_can_shift() const {
+    // Shifting is trivially supported for recurrent
+    return mem_attn->get_can_shift();
+}
+
+void llama_memory_hybrid::clear(bool data) {
+    mem_attn->clear(data);
+    mem_recr->clear(data);
+}
+
+bool llama_memory_hybrid::seq_rm(llama_seq_id seq_id, llama_pos p0, llama_pos p1) {
+    // Try removing from the recurrent cache first since it may fail. If it does
+    // fail, the cache will not have been mutated.
+    if (!mem_recr->seq_rm(seq_id, p0, p1)) {
+        return false;
+    }
+    return mem_attn->seq_rm(seq_id, p0, p1);
+}
+
+void llama_memory_hybrid::seq_cp(llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) {
+    mem_attn->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+    mem_recr->seq_cp(seq_id_src, seq_id_dst, p0, p1);
+}
+
+void llama_memory_hybrid::seq_keep(llama_seq_id seq_id) {
+    mem_attn->seq_keep(seq_id);
+    mem_recr->seq_keep(seq_id);
+}
+
+void llama_memory_hybrid::seq_add(llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) {
+    mem_attn->seq_add(seq_id, p0, p1, shift);
+    mem_recr->seq_add(seq_id, p0, p1, shift);
+}
+
+void llama_memory_hybrid::seq_div(llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    mem_attn->seq_div(seq_id, p0, p1, d);
+    mem_recr->seq_div(seq_id, p0, p1, d);
+}
+
+llama_pos llama_memory_hybrid::seq_pos_min(llama_seq_id seq_id) const {
+    // the min of the total cache is the max of the two caches' min values
+    return std::max(mem_attn->seq_pos_min(seq_id), mem_recr->seq_pos_min(seq_id));
+}
+
+llama_pos llama_memory_hybrid::seq_pos_max(llama_seq_id seq_id) const {
+    // the max of the total cache is the min of the two caches' max values
+    return std::min(mem_attn->seq_pos_max(seq_id), mem_recr->seq_pos_max(seq_id));
+}
+
+void llama_memory_hybrid::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
+    mem_attn->state_write(io, seq_id);
+    mem_recr->state_write(io, seq_id);
+}
+
+void llama_memory_hybrid::state_read(llama_io_read_i & io, llama_seq_id seq_id) {
+    mem_attn->state_read(io, seq_id);
+    mem_recr->state_read(io, seq_id);
+}
+
+llama_kv_cache_unified * llama_memory_hybrid::get_mem_attn() const {
+    return mem_attn.get();
+}
+
+llama_memory_recurrent * llama_memory_hybrid::get_mem_recr() const {
+    return mem_recr.get();
+}
+
+llama_memory_hybrid_state::llama_memory_hybrid_state(llama_memory_status status) : status(status) {}
+
+llama_memory_hybrid_state::llama_memory_hybrid_state(llama_memory_hybrid * mem) :
+    state_attn(mem->get_mem_attn()->init_full()),
+    state_recr(mem->get_mem_recr()->init_full()),
+    status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) {
+}
+
+llama_memory_hybrid_state::llama_memory_hybrid_state(
+        llama_memory_hybrid * mem,
+              llama_context * lctx,
+                       bool   optimize) :
+    state_attn(mem->get_mem_attn()->init_update(lctx, optimize)),
+    state_recr(mem->get_mem_recr()->init_update(lctx, optimize)),
+    status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) {
+}
+
+llama_memory_hybrid_state::llama_memory_hybrid_state(
+              llama_memory_hybrid * mem,
+            std::vector<uint32_t>   heads_attn,
+        std::vector<llama_ubatch>   ubatches) :
+    ubatches(std::move(ubatches)),
+    // note: here we copy the ubatches. not sure if this is ideal
+    state_attn(new llama_kv_cache_unified_state(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)),
+    state_recr(new llama_memory_recurrent_state(mem->get_mem_recr(),                        this->ubatches)),
+    status(llama_memory_status_combine(state_attn->get_status(), state_recr->get_status())) {
+}
+
+bool llama_memory_hybrid_state::next() {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    state_attn->next();
+    state_recr->next();
+
+    if (++i_next >= ubatches.size()) {
+        return false;
+    }
+
+    return true;
+}
+
+bool llama_memory_hybrid_state::apply() {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+
+    bool res = true;
+
+    res = res & state_attn->apply();
+    res = res & state_recr->apply();
+
+    return res;
+}
+
+llama_memory_status llama_memory_hybrid_state::get_status() const {
+    return status;
+}
+
+const llama_ubatch & llama_memory_hybrid_state::get_ubatch() const {
+    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+    return ubatches[i_next];
+}
+
+const llama_kv_cache_unified_state * llama_memory_hybrid_state::get_state_attn() const {
+    return static_cast<const llama_kv_cache_unified_state *>(state_attn.get());
+}
+
+const llama_memory_recurrent_state * llama_memory_hybrid_state::get_state_recr() const {
+    return static_cast<const llama_memory_recurrent_state *>(state_recr.get());
+}
--- a/examples/talk-llama/llama-memory-hybrid.h
+++ b/examples/talk-llama/llama-memory-hybrid.h
@ -0,0 +1,138 @@
+#pragma once
+
+#include "llama-batch.h"
+#include "llama-graph.h"
+#include "llama-kv-cache-unified.h"
+#include "llama-memory.h"
+#include "llama-memory-recurrent.h"
+
+#include <memory>
+#include <vector>
+
+//
+// llama_memory_hybrid
+//
+
+// utilizes instances of llama_memory_recurrent and llama_kv_cache_unified to
+//   support models where each layer may be either attention-based or recurrent
+
+class llama_memory_hybrid : public llama_memory_i {
+public:
+
+    // this callback is used to filter out layers that should not be included in the cache
+    using layer_filter_cb = std::function<bool(int32_t il)>;
+
+    llama_memory_hybrid(
+        const llama_model & model,
+                            /* attn */
+                ggml_type    type_k,
+                ggml_type    type_v,
+                     bool    v_trans,
+                 uint32_t    kv_size,
+                 uint32_t    n_pad,
+                 uint32_t    n_swa,
+           llama_swa_type    swa_type,
+                             /* recurrent */
+                ggml_type    type_r,
+                ggml_type    type_s,
+                 uint32_t    rs_size,
+                             /* common */
+                 uint32_t    n_seq_max,
+                     bool    offload,
+                             /* layer filters */
+          layer_filter_cb && filter_attn = nullptr,
+          layer_filter_cb && filter_recr = nullptr);
+
+    ~llama_memory_hybrid() = default;
+
+    //
+    // llama_memory_i
+    //
+
+    llama_memory_state_ptr init_batch(
+            llama_batch_allocr & balloc,
+            uint32_t n_ubatch,
+            bool embd_all) override;
+
+    llama_memory_state_ptr init_full() override;
+
+    llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) override;
+
+    bool get_can_shift() const override;
+
+    void clear(bool data) override;
+
+    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
+    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+    void seq_keep(llama_seq_id seq_id)                                                          override;
+    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) override;
+    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
+
+    llama_pos seq_pos_min(llama_seq_id seq_id) const override;
+    llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+    // state write/load
+
+    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1)       override;
+
+    //
+    // llama_memory_hybrid specific API
+    //
+
+    llama_kv_cache_unified * get_mem_attn() const;
+    llama_memory_recurrent * get_mem_recr() const;
+
+private:
+    const llama_hparams & hparams;
+
+    const std::unique_ptr<llama_kv_cache_unified> mem_attn;
+    const std::unique_ptr<llama_memory_recurrent> mem_recr;
+};
+
+class llama_memory_hybrid_state : public llama_memory_state_i {
+public:
+    // init failure
+    explicit llama_memory_hybrid_state(llama_memory_status status);
+
+    // init full
+    explicit llama_memory_hybrid_state(llama_memory_hybrid * mem);
+
+    // init update
+    explicit llama_memory_hybrid_state(
+        llama_memory_hybrid * mem,
+              llama_context * lctx,
+                       bool   optimize);
+
+    // init success
+    llama_memory_hybrid_state(
+              llama_memory_hybrid * mem,
+            std::vector<uint32_t>   heads_attn,
+        std::vector<llama_ubatch>   ubatches);
+
+    ~llama_memory_hybrid_state() = default;
+
+    bool next()  override;
+    bool apply() override;
+
+    llama_memory_status  get_status() const override;
+    const llama_ubatch & get_ubatch() const override;
+
+    //
+    // llama_memory_hybrid_state
+    //
+
+    const llama_kv_cache_unified_state * get_state_attn() const;
+    const llama_memory_recurrent_state * get_state_recr() const;
+
+private:
+    // the index of the next ubatch to process
+    size_t i_next = 0;
+
+    std::vector<llama_ubatch> ubatches;
+
+    const llama_memory_state_ptr state_attn;
+    const llama_memory_state_ptr state_recr;
+
+    const llama_memory_status status;
+};
--- a/examples/talk-llama/llama-memory-recurrent.cpp
+++ b/examples/talk-llama/llama-memory-recurrent.cpp
--- a/Show More
+++ b/Show More