Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-07-01 23:10:47 +02:00)
Compare commits
114 commits (SHA1 only; the author, date, and message columns are empty in this listing):

7aa1174315, 1290fc6457, 49b529ba74, 8088a977af, c9aeb33676, 4a3f0d3fe9, 874bde887e, 8738427dd6, c3991bbb24, 00ea21668b, 0b85e8c401, fafd78945d, 8de452c18b, a6dbd9188b, 4ef3398e8f, 5e9f33596f, 8d7b29cedd, 08dc705a69, 1512545149, 52a3e0c92a, d1ea1220ff, 9c4a1522f6, f078a6f20e, f30b5d322c, 44efbf7ff1, d347a59a5f, 6394c906af, 74ffa14e1d, 65fdcbbbbb, d61d55cd4b, d51fc3ee0a, f82a7dd019, 87dd4a3081, 41e05c6b1b, fa379cb22a, 322f4e6c4e, 1652965529, 6042c7a3be, 6b351bb669, a62170c656, 1944e7c33e, 49a8dd6732, 8c7f642286, ad2a4ffa03, b3c865083e, a0d4f8e65c, 4a214d2f07, 0a0cfa7985, 196d738974, 84c6b42e65, dd6d582977, d51c5eb906, 0be6a1afd9, a466c3404d, d629c034a4, f00509d57c, 424c410c42, d97e6005e9, 3467230a77, a091581eb3, 68daf6e487, a593b932e4, 9a8ad3db69, 4e0b2069e7, ac521a566e, 331c0bbddc, dc90efd504, 7282e2109e, 466ceebb78, 77226aa89d, 543bd5627e, 62fee9a9cc, 493d94130d, 1480a5f1af, 0f4227d9ee, 4c1fe0c813, fa463313ad, 501a6b455c, 91fc08c641, e1432dd91a, 22193cbfe8, 42c6730732, 76b6211f9b, 86a277f78d, 231bebca7d, 90564f85f9, 99da1e5cc8, 8e3f129b4d, 1d716d6e34, 419b8a6402, 1eb81f863f, fba10a4c68, afe2db0fe2, a7047b2a28, 32fbc8cd04, b8065d90f5, 4312995974, 5eeeb3412d, 6a69e3ae27, bf69b669a0, ea19ed33f1, 675e787171, c6c3ad5a98, 6a7c82501e, a82d331034, c37c2443c1, 0f11759406, 5a5c5ddcca, 34e0b4b9ef, b0f8013eb9, 124c718c73, f66ac6dc4f, 9955fa4ed7, a613f16aec
.github/workflows/bindings.yml (vendored, new file, +17)
@@ -0,0 +1,17 @@
name: Bindings Tests
on:
  push:
    paths:
      - bindings/go/**

jobs:
  ubuntu-latest:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/setup-go@v3
        with:
          go-version: '^1.19'
      - uses: actions/checkout@v1
      - run: |
          cd bindings/go
          make test
.github/workflows/build.yml (vendored, 86 changed lines)
@@ -119,7 +119,59 @@ jobs:

    strategy:
      matrix:
        build: [RelWithDebInfo]
        build: [Release]
        arch: [Win32, x64]
        sdl2: [ON]
        include:
          - arch: Win32
            s2arc: x86
          - arch: x64
            s2arc: x64
          - sdl2: ON
            s2ver: 2.26.0

    steps:
      - name: Clone
        uses: actions/checkout@v1

      - name: Add msbuild to PATH
        uses: microsoft/setup-msbuild@v1

      - name: Fetch SDL2 and set SDL2_DIR
        if: matrix.sdl2 == 'ON'
        run: |
          C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
          7z x sdl2.zip
          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

      - name: Configure
        run: >
          cmake -S . -B ./build -A ${{ matrix.arch }}
          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
          -DWHISPER_SUPPORT_SDL2=${{ matrix.sdl2 }}

      - name: Build
        run: |
          cd ./build
          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}

      - name: Copy SDL2.dll
        if: matrix.sdl2 == 'ON'
        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

      - name: Upload binaries
        if: matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
          name: whisper-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  windows-blas:
    runs-on: windows-latest

    strategy:
      matrix:
        build: [Release]
        arch: [Win32, x64]
        blas: [ON]
        sdl2: [ON]

@@ -181,5 +233,35 @@
        if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
        uses: actions/upload-artifact@v1
        with:
          name: whisper-bin-${{ matrix.arch }}
          name: whisper-blas-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}

  emscripten:
    runs-on: ubuntu-latest

    strategy:
      matrix:
        build: [Release]

    steps:
      - name: Clone
        uses: actions/checkout@v1

      - name: Dependencies
        run: |
          wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
          tar -xvf master.tar.gz
          emsdk-master/emsdk update
          emsdk-master/emsdk install latest
          emsdk-master/emsdk activate latest

      - name: Configure
        run: echo "tmp"

      - name: Build
        run: |
          pushd emsdk-master
          source ./emsdk_env.sh
          popd
          emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
          make
.gitignore (vendored, 2 changed lines)
@@ -8,6 +8,7 @@ build/
build-em/
build-debug/
build-release/
build-static/
build-sanitize-addr/
build-sanitize-thread/

@@ -18,6 +19,7 @@ build-sanitize-thread/
/bench

sync.sh
libwhisper.a
libwhisper.so
compile_commands.json
CMakeLists.txt
@@ -1,14 +1,16 @@
cmake_minimum_required (VERSION 3.0)
project(whisper.cpp VERSION 1.0.3)

set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
project(whisper.cpp VERSION 1.1.0)

# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")

if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    set(WHISPER_STANDALONE ON)
    include(cmake/GitVars.cmake)
    include(cmake/BuildTypes.cmake)
    include(GitVars)
    include(BuildTypes)

    # configure project version
    if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")

@@ -51,6 +53,7 @@ if (APPLE)
    option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
    option(WHISPER_NO_AVX        "whisper: disable AVX" OFF)
    option(WHISPER_NO_AVX2       "whisper: disable AVX2" OFF)
    option(WHISPER_NO_FMA        "whisper: disable FMA" OFF)
else()
    option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
endif()

@@ -81,9 +84,6 @@ endif()

# dependencies

set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 11)

find_package(Threads REQUIRED)

# on APPLE - include Accelerate framework

@@ -130,6 +130,13 @@ if (WHISPER_ALL_WARNINGS)
        -Wcast-qual \
        -Wstrict-prototypes \
        -Wpointer-arith \
        -Wno-unused-function \
    ")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
        -Wall \
        -Wextra \
        -Wpedantic \
        -Wcast-qual \
    ")
else()
    # todo : msvc

@@ -150,6 +157,7 @@
    if (MSVC)
        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
    else()
        if (EMSCRIPTEN)
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")

@@ -161,7 +169,10 @@ else()
            if(NOT WHISPER_NO_AVX2)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
            endif()
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma -mf16c")
            if(NOT WHISPER_NO_FMA)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
            endif()
            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
        endif()
    endif()
endif()

@@ -177,10 +188,14 @@ endif()
set(TARGET whisper)

add_library(${TARGET}
    ggml.h
    ggml.c
    whisper.h
    whisper.cpp
    )

include(DefaultTargetOptions)

target_include_directories(${TARGET} PUBLIC
    .
    )

@@ -214,6 +229,7 @@ target_compile_definitions(${TARGET} PUBLIC
install(TARGETS ${TARGET}
    LIBRARY DESTINATION lib
    ARCHIVE DESTINATION lib/static
    RUNTIME DESTINATION bin
    )

#
Makefile (39 changed lines)
@@ -10,6 +10,9 @@ ifndef UNAME_M
UNAME_M := $(shell uname -m)
endif

CCV := $(shell $(CC) --version | head -n 1)
CXXV := $(shell $(CXX) --version | head -n 1)

# Mac OS + Arm can report x86_64
# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
ifeq ($(UNAME_S),Darwin)

@@ -53,10 +56,13 @@ endif
# Architecture specific
# TODO: probably these flags need to be tweaked on some architectures
# feel free to update the Makefile for your architecture and send a pull request or issue
ifeq ($(UNAME_M),x86_64)
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
	ifeq ($(UNAME_S),Darwin)
		CFLAGS += -mfma -mf16c
		CFLAGS += -mf16c
		AVX1_M := $(shell sysctl machdep.cpu.features)
		ifneq (,$(findstring FMA,$(AVX1_M)))
			CFLAGS += -mfma
		endif
		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
			CFLAGS += -mavx
		endif

@@ -81,6 +87,10 @@ ifeq ($(UNAME_M),x86_64)
		ifneq (,$(findstring f16c,$(F16C_M)))
			CFLAGS += -mf16c
		endif
		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
		ifneq (,$(findstring sse3,$(SSE3_M)))
			CFLAGS += -msse3
		endif
	else ifeq ($(UNAME_S),Haiku)
		AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
		ifneq (,$(findstring avx,$(AVX1_M)))

@@ -105,6 +115,12 @@ endif
ifeq ($(UNAME_M),amd64)
	CFLAGS += -mavx -mavx2 -mfma -mf16c
endif
ifeq ($(UNAME_M),ppc64le)
	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
	ifneq (,$(findstring POWER9,$(POWER9_M)))
		CFLAGS += -mpower9-vector
	endif
endif
ifndef WHISPER_NO_ACCELERATE
	# Mac M1 - include Accelerate framework
	ifeq ($(UNAME_S),Darwin)

@@ -117,8 +133,8 @@ ifdef WHISPER_OPENBLAS
	LDFLAGS += -lopenblas
endif
ifdef WHISPER_GPROF
	CFLAGS += -pg
	CXXFLAGS += -pg
	CFLAGS += -pg
	CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
endif

@@ -135,6 +151,21 @@ ifneq ($(filter armv8%,$(UNAME_M)),)
	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
endif

#
# Print build information
#

$(info I whisper.cpp build info: )
$(info I UNAME_S:  $(UNAME_S))
$(info I UNAME_P:  $(UNAME_P))
$(info I UNAME_M:  $(UNAME_M))
$(info I CFLAGS:   $(CFLAGS))
$(info I CXXFLAGS: $(CXXFLAGS))
$(info I LDFLAGS:  $(LDFLAGS))
$(info I CC:       $(CCV))
$(info I CXX:      $(CXXV))
$(info )

default: main

#
README.md (86 changed lines)
@@ -4,11 +4,14 @@
[](https://opensource.org/licenses/MIT)
[](https://www.npmjs.com/package/whisper.cpp/)

Stable: [v1.0.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.0.4) / Beta: [v1.1.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.1.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

- Plain C/C++ implementation without dependencies
- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
- AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- Low memory usage (Flash Attention + Flash Forward)
- Zero memory allocations at runtime

@@ -19,11 +22,11 @@ Supported platforms:

- [x] Mac OS (Intel and Arm)
- [x] [iOS](examples/whisper.objc)
- [x] Linux
- [x] [Android](examples/whisper.android)
- [x] Linux / [FreeBSD](https://github.com/ggerganov/whisper.cpp/issues/56#issuecomment-1350920264)
- [x] [WebAssembly](examples/whisper.wasm)
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [Android](https://github.com/ggerganov/whisper.cpp/issues/30)

The entire implementation of the model is contained in 2 source files:

@@ -68,7 +71,7 @@ Now build the [main](examples/main) example and transcribe an audio file like th
make

# transcribe an audio file
./main -f input.wav
./main -f samples/jfk.wav
```

---

@@ -86,27 +89,36 @@ c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o gg
usage: ./main [options] file0.wav file1.wav ...

options:
  -h,       --help          [default] show this help message and exit
  -t N,     --threads N     [4      ] number of threads to use during computation
  -p N,     --processors N  [1      ] number of processors to use during computation
  -ot N,    --offset-t N    [0      ] time offset in milliseconds
  -on N,    --offset-n N    [0      ] segment index offset
  -d N,     --duration N    [0      ] duration of audio to process in milliseconds
  -mc N,    --max-context N [-1     ] maximum number of text context tokens to store
  -ml N,    --max-len N     [0      ] maximum segment length in characters
  -wt N,    --word-thold N  [0.01   ] word timestamp probability threshold
  -su,      --speed-up      [false  ] speed up audio by x2 (reduced accuracy)
  -tr,      --translate     [false  ] translate from source language to english
  -otxt,    --output-txt    [false  ] output result in a text file
  -ovtt,    --output-vtt    [false  ] output result in a vtt file
  -osrt,    --output-srt    [false  ] output result in a srt file
  -owts,    --output-words  [false  ] output script for generating karaoke video
  -ps,      --print-special [false  ] print special tokens
  -pc,      --print-colors  [false  ] print colors
  -nt,      --no-timestamps [true   ] do not print timestamps
  -l LANG,  --language LANG [en     ] spoken language
  -m FNAME, --model FNAME   [models/ggml-base.en.bin] model path
  -f FNAME, --file FNAME    [       ] input WAV file path
  -h,        --help           [default] show this help message and exit
  -t N,      --threads N      [4      ] number of threads to use during computation
  -p N,      --processors N   [1      ] number of processors to use during computation
  -ot N,     --offset-t N     [0      ] time offset in milliseconds
  -on N,     --offset-n N     [0      ] segment index offset
  -d N,      --duration N     [0      ] duration of audio to process in milliseconds
  -mc N,     --max-context N  [-1     ] maximum number of text context tokens to store
  -ml N,     --max-len N      [0      ] maximum segment length in characters
  -bo N,     --best-of N      [5      ] number of best candidates to keep
  -bs N,     --beam-size N    [-1     ] beam size for beam search
  -wt N,     --word-thold N   [0.01   ] word timestamp probability threshold
  -et N,     --entropy-thold N [2.40  ] entropy threshold for decoder fail
  -lpt N,    --logprob-thold N [-1.00 ] log probability threshold for decoder fail
  -su,       --speed-up       [false  ] speed up audio by x2 (reduced accuracy)
  -tr,       --translate      [false  ] translate from source language to english
  -di,       --diarize        [false  ] stereo audio diarization
  -otxt,     --output-txt     [false  ] output result in a text file
  -ovtt,     --output-vtt     [false  ] output result in a vtt file
  -osrt,     --output-srt     [false  ] output result in a srt file
  -owts,     --output-words   [false  ] output script for generating karaoke video
  -ocsv,     --output-csv     [false  ] output result in a CSV file
  -ps,       --print-special  [false  ] print special tokens
  -pc,       --print-colors   [false  ] print colors
  -pp,       --print-progress [false  ] print progress
  -nt,       --no-timestamps  [true   ] do not print timestamps
  -l LANG,   --language LANG  [en     ] spoken language ('auto' for auto-detect)
             --prompt PROMPT  [       ] initial prompt
  -m FNAME,  --model FNAME    [models/ggml-base.en.bin] model path
  -f FNAME,  --file FNAME     [       ] input WAV file path

bash ./models/download-ggml-model.sh base.en
Downloading ggml model base.en ...

@@ -209,17 +221,7 @@ make large
## Limitations

- Inference only
- No GPU support
- Very basic greedy sampling scheme - always pick up the token with highest probability.
  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
  to run the python code with the following parameters:

  ```
  whisper --best_of None --beam_size None ...
  ```

  In the future, `whisper.cpp` will support more sampling strategies.
- No GPU support (yet)

## Another example

@@ -304,6 +306,7 @@ The [stream](examples/stream) tool samples the audio every half a second and run
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

```java
make stream
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```

@@ -445,12 +448,13 @@ or manually from here:
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
in [models](models).

## Bindings
## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)

- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm)
- [X] Javascript: [bindings/javascript](bindings/javascript)
- [ ] Python: soon
- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
- [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)

## Examples

@@ -465,6 +469,8 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
bindings/go/.gitignore (vendored, new file, +2)
@@ -0,0 +1,2 @@
build
models
bindings/go/LICENSE (new file, +21)
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 David Thorpe

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
bindings/go/Makefile (new file, +38)
@@ -0,0 +1,38 @@
BUILD_DIR := build
MODELS_DIR := models
EXAMPLES_DIR := $(wildcard examples/*)
INCLUDE_PATH := $(abspath ../..)
LIBRARY_PATH := $(abspath ../..)

all: clean whisper examples

whisper: mkdir
	@echo Build whisper
	@${MAKE} -C ../.. libwhisper.a

test: model-small whisper modtidy
	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v .
	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go test -v ./pkg/whisper/...

examples: $(EXAMPLES_DIR)

model-small: mkdir examples/go-model-download
	@${BUILD_DIR}/go-model-download -out models ggml-small.en.bin

$(EXAMPLES_DIR): mkdir whisper modtidy
	@echo Build example $(notdir $@)
	@C_INCLUDE_PATH=${INCLUDE_PATH} LIBRARY_PATH=${LIBRARY_PATH} go build ${BUILD_FLAGS} -o ${BUILD_DIR}/$(notdir $@) ./$@

mkdir:
	@echo Mkdir ${BUILD_DIR}
	@install -d ${BUILD_DIR}
	@echo Mkdir ${MODELS_DIR}
	@install -d ${MODELS_DIR}

modtidy:
	@go mod tidy

clean:
	@echo Clean
	@rm -fr $(BUILD_DIR)
	@go clean
bindings/go/README.md (new file, +100)
@@ -0,0 +1,100 @@
# Go bindings for Whisper

This package provides Go bindings for whisper.cpp. They have been tested on:

* Darwin (OS X) 12.6 on x64_64
* Debian Linux on arm64
* Fedora Linux on x86_64

The "low level" bindings are in the `bindings/go` directory and there is a more
Go-style package in the `bindings/go/pkg/whisper` directory. The most simple usage
is as follows:

```go
import (
	"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

func main() {
	var modelpath string  // Path to the model
	var samples []float32 // Samples to process

	// Load the model
	model, err := whisper.New(modelpath)
	if err != nil {
		panic(err)
	}
	defer model.Close()

	// Process samples
	context, err := model.NewContext()
	if err != nil {
		panic(err)
	}
	if err := context.Process(samples, nil); err != nil {
		return err
	}

	// Print out the results
	for {
		segment, err := context.NextSegment()
		if err != nil {
			break
		}
		fmt.Printf("[%6s->%6s] %s\n", segment.Start, segment.End, segment.Text)
	}
}
```

## Building & Testing

In order to build, you need to have the Go compiler installed. You can get it from [here](https://golang.org/dl/). Run the tests with:

```bash
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp/bindings/go
make test
```

This will compile a static `libwhisper.a` in a `build` folder, download a model file, then run the tests. To build the examples:

```bash
make examples
```

The examples are placed in the `build` directory. Once built, you can download all the models with the following command:

```bash
./build/go-model-download -out models
```

And you can then test a model against samples with the following command:

```bash
./build/go-whisper -model models/ggml-tiny.en.bin samples/jfk.wav
```

## Using the bindings

To use the bindings in your own software,

1. Import `github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper` (or `github.com/ggerganov/whisper.cpp/bindings/go` into your package;
2. Compile `libwhisper.a` (you can use `make whisper` in the `bindings/go` directory);
3. Link your go binary against whisper by setting the environment variables `C_INCLUDE_PATH` and `LIBRARY_PATH`
   to point to the `whisper.h` file directory and `libwhisper.a` file directory respectively.

Look at the `Makefile` in the `bindings/go` directory for an example.
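
As a quick check that steps 1-3 are wired up correctly, the following is a minimal sketch of a consumer program; the build command in the comments and the model path are illustrative only and not part of the bindings.

```go
// main.go - minimal sketch of steps 1-3 above.
// Build with the include and library paths pointing at a whisper.cpp checkout
// that already contains whisper.h and a compiled libwhisper.a, for example:
//
//   C_INCLUDE_PATH=/path/to/whisper.cpp LIBRARY_PATH=/path/to/whisper.cpp go build .
package main

import (
	"fmt"

	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

func main() {
	// "models/ggml-base.en.bin" is a hypothetical model path.
	model, err := whisper.New("models/ggml-base.en.bin")
	if err != nil {
		panic(err)
	}
	defer model.Close()
	fmt.Println("model loaded")
}
```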

The API Documentation:

* https://pkg.go.dev/github.com/ggerganov/whisper.cpp/bindings/go
* https://pkg.go.dev/github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper

Getting help:

* Follow the discussion for the go bindings [here](https://github.com/ggerganov/whisper.cpp/discussions/312)

## License

The license for the Go bindings is the same as the license for the rest of the whisper.cpp project, which is the MIT License. See the `LICENSE` file for more details.
bindings/go/doc.go (new file, +5)
@@ -0,0 +1,5 @@
/*
github.com/ggerganov/whisper.cpp/bindings/go
provides a speech-to-text service bindings for the Go programming language.
*/
package whisper
bindings/go/examples/go-model-download/context.go (new file, +30)
@@ -0,0 +1,30 @@
package main

import (
	"context"
	"os"
	"os/signal"
)

// ContextForSignal returns a context object which is cancelled when a signal
// is received. It returns nil if no signal parameter is provided
func ContextForSignal(signals ...os.Signal) context.Context {
	if len(signals) == 0 {
		return nil
	}

	ch := make(chan os.Signal)
	ctx, cancel := context.WithCancel(context.Background())

	// Send message on channel when signal received
	signal.Notify(ch, signals...)

	// When any signal received, call cancel
	go func() {
		<-ch
		cancel()
	}()

	// Return success
	return ctx
}
bindings/go/examples/go-model-download/main.go (new file, +208)
@@ -0,0 +1,208 @@
package main

import (
	"context"
	"flag"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"syscall"
	"time"
)

///////////////////////////////////////////////////////////////////////////////
// CONSTANTS

const (
	srcUrl  = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
	srcExt  = ".bin"                                                               // Filename extension
	bufSize = 1024 * 64                                                            // Size of the buffer used for downloading the model
)

var (
	// The models which will be downloaded, if no model is specified as an argument
	modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
)

var (
	// The output folder. When not set, use current working directory.
	flagOut = flag.String("out", "", "Output folder")

	// HTTP timeout parameter - will timeout if takes longer than this to download a model
	flagTimeout = flag.Duration("timeout", 30*time.Minute, "HTTP timeout")

	// Quiet parameter - will not print progress if set
	flagQuiet = flag.Bool("quiet", false, "Quiet mode")
)

///////////////////////////////////////////////////////////////////////////////
// MAIN

func main() {
	flag.Usage = func() {
		name := filepath.Base(flag.CommandLine.Name())
		fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [options] <model>\n\n", name)
		flag.PrintDefaults()
	}
	flag.Parse()

	// Get output path
	out, err := GetOut()
	if err != nil {
		fmt.Fprintln(os.Stderr, "Error:", err)
		os.Exit(-1)
	}

	// Create context which quits on SIGINT or SIGQUIT
	ctx := ContextForSignal(os.Interrupt, syscall.SIGQUIT)

	// Progress filehandle
	progress := os.Stdout
	if *flagQuiet {
		progress, err = os.Open(os.DevNull)
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error:", err)
			os.Exit(-1)
		}
		defer progress.Close()
	}

	// Download models - exit on error or interrupt
	for _, model := range GetModels() {
		url, err := URLForModel(model)
		if err != nil {
			fmt.Fprintln(os.Stderr, "Error:", err)
			continue
		} else if path, err := Download(ctx, progress, url, out); err == nil || err == io.EOF {
			continue
		} else if err == context.Canceled {
			os.Remove(path)
			fmt.Fprintln(progress, "\nInterrupted")
			break
		} else if err == context.DeadlineExceeded {
			os.Remove(path)
			fmt.Fprintln(progress, "Timeout downloading model")
			continue
		} else {
			os.Remove(path)
			fmt.Fprintln(os.Stderr, "Error:", err)
			break
		}
	}
}

///////////////////////////////////////////////////////////////////////////////
// PUBLIC METHODS

// GetOut returns the path to the output directory
func GetOut() (string, error) {
	if *flagOut == "" {
		return os.Getwd()
	}
	if info, err := os.Stat(*flagOut); err != nil {
		return "", err
	} else if !info.IsDir() {
		return "", fmt.Errorf("not a directory: %s", info.Name())
	} else {
		return *flagOut, nil
	}
}

// GetModels returns the list of models to download
func GetModels() []string {
	if flag.NArg() == 0 {
		return modelNames
	} else {
		return flag.Args()
	}
}

// URLForModel returns the URL for the given model on huggingface.co
func URLForModel(model string) (string, error) {
	if filepath.Ext(model) != srcExt {
		model += srcExt
	}
	url, err := url.Parse(srcUrl)
	if err != nil {
		return "", err
	} else {
		url.Path = filepath.Join(url.Path, model)
	}
	return url.String(), nil
}

// Download downloads the model from the given URL to the given output directory
func Download(ctx context.Context, p io.Writer, model, out string) (string, error) {
	// Create HTTP client
	client := http.Client{
		Timeout: *flagTimeout,
	}

	// Initiate the download
	req, err := http.NewRequest("GET", model, nil)
	if err != nil {
		return "", err
	}
	resp, err := client.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("%s: %s", model, resp.Status)
	}

	// If output file exists and is the same size as the model, skip
	path := filepath.Join(out, filepath.Base(model))
	if info, err := os.Stat(path); err == nil && info.Size() == resp.ContentLength {
		fmt.Fprintln(p, "Skipping", model, "as it already exists")
		return "", nil
	}

	// Create file
	w, err := os.Create(path)
	if err != nil {
		return "", err
	}
	defer w.Close()

	// Report
	fmt.Fprintln(p, "Downloading", model, "to", out)

	// Progressively download the model
	data := make([]byte, bufSize)
	count, pct := int64(0), int64(0)
	ticker := time.NewTicker(5 * time.Second)
	for {
		select {
		case <-ctx.Done():
			// Cancelled, return error
			return path, ctx.Err()
		case <-ticker.C:
			pct = DownloadReport(p, pct, count, resp.ContentLength)
		default:
			// Read body
			n, err := resp.Body.Read(data)
			if err != nil {
				DownloadReport(p, pct, count, resp.ContentLength)
				return path, err
			} else if m, err := w.Write(data[:n]); err != nil {
				return path, err
			} else {
				count += int64(m)
			}
		}
	}
}

// Report periodically reports the download progress when percentage changes
func DownloadReport(w io.Writer, pct, count, total int64) int64 {
	pct_ := count * 100 / total
	if pct_ > pct {
		fmt.Fprintf(w, "  ...%d MB written (%d%%)\n", count/1e6, pct_)
	}
	return pct_
}
bindings/go/examples/go-whisper/color.go (new file, +22)
@@ -0,0 +1,22 @@
package main

import "fmt"

///////////////////////////////////////////////////////////////////////////////
// CONSTANTS

const (
	Reset     = "\033[0m"
	RGBPrefix = "\033[38;5;" // followed by RGB values in decimal format separated by colons
	RGBSuffix = "m"
)

///////////////////////////////////////////////////////////////////////////////
// PUBLIC METHODS

// Colorize text with RGB values, from 0 to 23
func Colorize(text string, v int) string {
	// https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit
	// Grayscale colors are in the range 232-255
	return RGBPrefix + fmt.Sprint(v%24+232) + RGBSuffix + text + Reset
}
bindings/go/examples/go-whisper/flags.go (new file, +156)
@@ -0,0 +1,156 @@
package main

import (
	"flag"
	"fmt"
	"strings"
	"time"

	// Packages
	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

///////////////////////////////////////////////////////////////////////////////
// TYPES

type Flags struct {
	*flag.FlagSet
}

///////////////////////////////////////////////////////////////////////////////
// LIFECYCLE

func NewFlags(name string, args []string) (*Flags, error) {
	flags := &Flags{
		FlagSet: flag.NewFlagSet(name, flag.ContinueOnError),
	}

	// Register the command line arguments
	registerFlags(flags)

	// Parse command line
	if err := flags.Parse(args); err != nil {
		return nil, err
	}

	// Return success
	return flags, nil
}

///////////////////////////////////////////////////////////////////////////////
// PUBLIC METHODS

func (flags *Flags) GetModel() string {
	return flags.Lookup("model").Value.String()
}

func (flags *Flags) GetLanguage() string {
	return flags.Lookup("language").Value.String()
}

func (flags *Flags) IsTranslate() bool {
	return flags.Lookup("translate").Value.(flag.Getter).Get().(bool)
}

func (flags *Flags) GetOffset() time.Duration {
	return flags.Lookup("offset").Value.(flag.Getter).Get().(time.Duration)
}

func (flags *Flags) GetDuration() time.Duration {
	return flags.Lookup("duration").Value.(flag.Getter).Get().(time.Duration)
}

func (flags *Flags) GetThreads() uint {
	return flags.Lookup("threads").Value.(flag.Getter).Get().(uint)
}

func (flags *Flags) GetOut() string {
	return strings.ToLower(flags.Lookup("out").Value.String())
}

func (flags *Flags) IsSpeedup() bool {
	return flags.Lookup("speedup").Value.String() == "true"
}

func (flags *Flags) IsTokens() bool {
	return flags.Lookup("tokens").Value.String() == "true"
}

func (flags *Flags) IsColorize() bool {
	return flags.Lookup("colorize").Value.String() == "true"
}

func (flags *Flags) GetMaxLen() uint {
	return flags.Lookup("max-len").Value.(flag.Getter).Get().(uint)
}

func (flags *Flags) GetMaxTokens() uint {
	return flags.Lookup("max-tokens").Value.(flag.Getter).Get().(uint)
}

func (flags *Flags) GetWordThreshold() float32 {
	return float32(flags.Lookup("word-thold").Value.(flag.Getter).Get().(float64))
}

func (flags *Flags) SetParams(context whisper.Context) error {
	if lang := flags.GetLanguage(); lang != "" && lang != "auto" {
		fmt.Fprintf(flags.Output(), "Setting language to %q\n", lang)
		if err := context.SetLanguage(lang); err != nil {
			return err
		}
	}
	if flags.IsTranslate() && context.IsMultilingual() {
		fmt.Fprintf(flags.Output(), "Setting translate to true\n")
		context.SetTranslate(true)
	}
	if offset := flags.GetOffset(); offset != 0 {
		fmt.Fprintf(flags.Output(), "Setting offset to %v\n", offset)
		context.SetOffset(offset)
	}
	if duration := flags.GetDuration(); duration != 0 {
		fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
		context.SetDuration(duration)
	}
	if flags.IsSpeedup() {
		fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
		context.SetSpeedup(true)
	}
	if threads := flags.GetThreads(); threads != 0 {
		fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
		context.SetThreads(threads)
	}
	if max_len := flags.GetMaxLen(); max_len != 0 {
		fmt.Fprintf(flags.Output(), "Setting max_segment_length to %d\n", max_len)
		context.SetMaxSegmentLength(max_len)
	}
	if max_tokens := flags.GetMaxTokens(); max_tokens != 0 {
		fmt.Fprintf(flags.Output(), "Setting max_tokens to %d\n", max_tokens)
		context.SetMaxTokensPerSegment(max_tokens)
	}
	if word_threshold := flags.GetWordThreshold(); word_threshold != 0 {
		fmt.Fprintf(flags.Output(), "Setting word_threshold to %f\n", word_threshold)
		context.SetTokenThreshold(word_threshold)
	}

	// Return success
	return nil
}

///////////////////////////////////////////////////////////////////////////////
// PRIVATE METHODS

func registerFlags(flag *Flags) {
	flag.String("model", "", "Path to the model file")
	flag.String("language", "", "Spoken language")
	flag.Bool("translate", false, "Translate from source language to english")
	flag.Duration("offset", 0, "Time offset")
	flag.Duration("duration", 0, "Duration of audio to process")
	flag.Uint("threads", 0, "Number of threads to use")
	flag.Bool("speedup", false, "Enable speedup")
	flag.Uint("max-len", 0, "Maximum segment length in characters")
	flag.Uint("max-tokens", 0, "Maximum tokens per segment")
	flag.Float64("word-thold", 0, "Maximum segment score")
	flag.Bool("tokens", false, "Display tokens")
	flag.Bool("colorize", false, "Colorize tokens")
	flag.String("out", "", "Output format (srt, none or leave as empty string)")
}
bindings/go/examples/go-whisper/main.go (new file, +43)
@@ -0,0 +1,43 @@
package main

import (
	"flag"
	"fmt"
	"os"
	"path/filepath"

	// Packages
	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

func main() {
	flags, err := NewFlags(filepath.Base(os.Args[0]), os.Args[1:])
	if err == flag.ErrHelp {
		os.Exit(0)
	} else if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	} else if flags.GetModel() == "" {
		fmt.Fprintln(os.Stderr, "Use -model flag to specify which model file to use")
		os.Exit(1)
	} else if flags.NArg() == 0 {
		fmt.Fprintln(os.Stderr, "No input files specified")
		os.Exit(1)
	}

	// Load model
	model, err := whisper.New(flags.GetModel())
	if err != nil {
		fmt.Fprintln(os.Stderr, err)
		os.Exit(1)
	}
	defer model.Close()

	// Process files
	for _, filename := range flags.Args() {
		if err := Process(model, filename, flags); err != nil {
			fmt.Fprintln(os.Stderr, err)
			continue
		}
	}
}
bindings/go/examples/go-whisper/process.go (new file, +127)
@@ -0,0 +1,127 @@
package main

import (
	"fmt"
	"io"
	"os"
	"time"

	// Package imports
	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
	wav "github.com/go-audio/wav"
)

func Process(model whisper.Model, path string, flags *Flags) error {
	var data []float32

	// Create processing context
	context, err := model.NewContext()
	if err != nil {
		return err
	}

	// Set the parameters
	if err := flags.SetParams(context); err != nil {
		return err
	}

	// Open the file
	fmt.Fprintf(flags.Output(), "Loading %q\n", path)
	fh, err := os.Open(path)
	if err != nil {
		return err
	}
	defer fh.Close()

	// Decode the WAV file - load the full buffer
	dec := wav.NewDecoder(fh)
	if buf, err := dec.FullPCMBuffer(); err != nil {
		return err
	} else if dec.SampleRate != whisper.SampleRate {
		return fmt.Errorf("unsupported sample rate: %d", dec.SampleRate)
	} else if dec.NumChans != 1 {
		return fmt.Errorf("unsupported number of channels: %d", dec.NumChans)
	} else {
		data = buf.AsFloat32Buffer().Data
	}

	// Segment callback when -tokens is specified
	var cb whisper.SegmentCallback
	if flags.IsTokens() {
		cb = func(segment whisper.Segment) {
			fmt.Fprintf(flags.Output(), "%02d [%6s->%6s] ", segment.Num, segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
			for _, token := range segment.Tokens {
				if flags.IsColorize() && context.IsText(token) {
					fmt.Fprint(flags.Output(), Colorize(token.Text, int(token.P*24.0)), " ")
				} else {
					fmt.Fprint(flags.Output(), token.Text, " ")
				}
			}
			fmt.Fprintln(flags.Output(), "")
			fmt.Fprintln(flags.Output(), "")
		}
	}

	// Process the data
	fmt.Fprintf(flags.Output(), "  ...processing %q\n", path)
	if err := context.Process(data, cb); err != nil {
		return err
	}

	// Print out the results
	switch {
	case flags.GetOut() == "srt":
		return OutputSRT(os.Stdout, context)
	case flags.GetOut() == "none":
		return nil
	default:
		return Output(os.Stdout, context, flags.IsColorize())
	}
}

// Output text as SRT file
func OutputSRT(w io.Writer, context whisper.Context) error {
	n := 1
	for {
		segment, err := context.NextSegment()
		if err == io.EOF {
			return nil
		} else if err != nil {
			return err
		}
		fmt.Fprintln(w, n)
		fmt.Fprintln(w, srtTimestamp(segment.Start), " --> ", srtTimestamp(segment.End))
		fmt.Fprintln(w, segment.Text)
		fmt.Fprintln(w, "")
		n++
	}
}

// Output text to terminal
func Output(w io.Writer, context whisper.Context, colorize bool) error {
	for {
		segment, err := context.NextSegment()
		if err == io.EOF {
			return nil
		} else if err != nil {
			return err
		}
		fmt.Fprintf(w, "[%6s->%6s]", segment.Start.Truncate(time.Millisecond), segment.End.Truncate(time.Millisecond))
		if colorize {
			for _, token := range segment.Tokens {
				if !context.IsText(token) {
					continue
				}
				fmt.Fprint(w, " ", Colorize(token.Text, int(token.P*24.0)))
			}
			fmt.Fprint(w, "\n")
		} else {
			fmt.Fprintln(w, " ", segment.Text)
		}
	}
}

// Return srtTimestamp
func srtTimestamp(t time.Duration) string {
	return fmt.Sprintf("%02d:%02d:%02d,%03d", t/time.Hour, (t%time.Hour)/time.Minute, (t%time.Minute)/time.Second, (t%time.Second)/time.Millisecond)
}
bindings/go/go.mod (new file, +16)
@@ -0,0 +1,16 @@
module github.com/ggerganov/whisper.cpp/bindings/go

go 1.19

require (
	github.com/go-audio/wav v1.1.0
	github.com/stretchr/testify v1.8.1
)

require (
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/go-audio/audio v1.0.0 // indirect
	github.com/go-audio/riff v1.0.0 // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
)
bindings/go/go.sum (new file, +23)
@@ -0,0 +1,23 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
bindings/go/params.go (new file, +156)
@@ -0,0 +1,156 @@
package whisper

import (
	"fmt"
)

///////////////////////////////////////////////////////////////////////////////
// CGO

/*
#include <whisper.h>
*/
import "C"

///////////////////////////////////////////////////////////////////////////////
// PUBLIC METHODS

func (p *Params) SetTranslate(v bool) {
	p.translate = toBool(v)
}

func (p *Params) SetNoContext(v bool) {
	p.no_context = toBool(v)
}

func (p *Params) SetSingleSegment(v bool) {
	p.single_segment = toBool(v)
}

func (p *Params) SetPrintSpecial(v bool) {
	p.print_special = toBool(v)
}

func (p *Params) SetPrintProgress(v bool) {
	p.print_progress = toBool(v)
}

func (p *Params) SetPrintRealtime(v bool) {
	p.print_realtime = toBool(v)
}

func (p *Params) SetPrintTimestamps(v bool) {
	p.print_timestamps = toBool(v)
}

func (p *Params) SetSpeedup(v bool) {
	p.speed_up = toBool(v)
}

// Set language id
func (p *Params) SetLanguage(lang int) error {
	str := C.whisper_lang_str(C.int(lang))
	if str == nil {
		return ErrInvalidLanguage
	} else {
		p.language = str
	}
	return nil
}

// Get language id
func (p *Params) Language() int {
	if p.language == nil {
		return -1
	}
	return int(C.whisper_lang_id(p.language))
}

// Set number of threads to use
func (p *Params) SetThreads(threads int) {
	p.n_threads = C.int(threads)
}

// Set start offset in ms
func (p *Params) SetOffset(offset_ms int) {
	p.offset_ms = C.int(offset_ms)
}

// Set audio duration to process in ms
func (p *Params) SetDuration(duration_ms int) {
	p.duration_ms = C.int(duration_ms)
}

// Set timestamp token probability threshold (~0.01)
func (p *Params) SetTokenThreshold(t float32) {
	p.thold_pt = C.float(t)
}

// Set timestamp token sum probability threshold (~0.01)
func (p *Params) SetTokenSumThreshold(t float32) {
	p.thold_ptsum = C.float(t)
}

// Set max segment length in characters
func (p *Params) SetMaxSegmentLength(n int) {
	p.max_len = C.int(n)
}

// Set max tokens per segment (0 = no limit)
func (p *Params) SetMaxTokensPerSegment(n int) {
	p.max_tokens = C.int(n)
}

///////////////////////////////////////////////////////////////////////////////
// PRIVATE METHODS

func toBool(v bool) C.bool {
	if v {
		return C.bool(true)
	}
	return C.bool(false)
}

///////////////////////////////////////////////////////////////////////////////
// STRINGIFY

func (p *Params) String() string {
	str := "<whisper.params"
	str += fmt.Sprintf(" strategy=%v", p.strategy)
	str += fmt.Sprintf(" n_threads=%d", p.n_threads)
	if p.language != nil {
		str += fmt.Sprintf(" language=%s", C.GoString(p.language))
	}
	str += fmt.Sprintf(" n_max_text_ctx=%d", p.n_max_text_ctx)
	str += fmt.Sprintf(" offset_ms=%d", p.offset_ms)
	str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
	if p.translate {
		str += " translate"
	}
	if p.no_context {
		str += " no_context"
	}
	if p.single_segment {
		str += " single_segment"
	}
	if p.print_special {
		str += " print_special"
	}
	if p.print_progress {
		str += " print_progress"
	}
	if p.print_realtime {
		str += " print_realtime"
	}
	if p.print_timestamps {
		str += " print_timestamps"
	}
	if p.token_timestamps {
		str += " token_timestamps"
	}
	if p.speed_up {
		str += " speed_up"
	}

	return str + ">"
}
bindings/go/pkg/whisper/consts.go (new file, +28)
@@ -0,0 +1,28 @@
package whisper

import (
	"errors"

	// Bindings
	whisper "github.com/ggerganov/whisper.cpp/bindings/go"
)

///////////////////////////////////////////////////////////////////////////////
// ERRORS

var (
	ErrUnableToLoadModel    = errors.New("unable to load model")
	ErrInternalAppError     = errors.New("internal application error")
	ErrProcessingFailed     = errors.New("processing failed")
	ErrUnsupportedLanguage  = errors.New("unsupported language")
	ErrModelNotMultilingual = errors.New("model is not multilingual")
)

///////////////////////////////////////////////////////////////////////////////
// CONSTANTS

// SampleRate is the sample rate of the audio data.
const SampleRate = whisper.SampleRate

// SampleBits is the number of bytes per sample.
const SampleBits = whisper.SampleBits
251
bindings/go/pkg/whisper/context.go
Normal file
251
bindings/go/pkg/whisper/context.go
Normal file
@ -0,0 +1,251 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
// Bindings
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
type context struct {
|
||||
n int
|
||||
model *model
|
||||
params whisper.Params
|
||||
}
|
||||
|
||||
// Make sure context adheres to the interface
|
||||
var _ Context = (*context)(nil)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// LIFECYCLE
|
||||
|
||||
func newContext(model *model, params whisper.Params) (Context, error) {
|
||||
context := new(context)
|
||||
context.model = model
|
||||
context.params = params
|
||||
|
||||
// Return success
|
||||
return context, nil
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
// Set the language to use for speech recognition.
|
||||
func (context *context) SetLanguage(lang string) error {
|
||||
if context.model.ctx == nil {
|
||||
return ErrInternalAppError
|
||||
}
|
||||
if !context.model.IsMultilingual() {
|
||||
return ErrModelNotMultilingual
|
||||
}
|
||||
if id := context.model.ctx.Whisper_lang_id(lang); id < 0 {
|
||||
return ErrUnsupportedLanguage
|
||||
} else if err := context.params.SetLanguage(id); err != nil {
|
||||
return err
|
||||
}
|
||||
// Return success
|
||||
return nil
|
||||
}
|
||||
|
||||
func (context *context) IsMultilingual() bool {
|
||||
return context.model.IsMultilingual()
|
||||
}
|
||||
|
||||
// Get language
|
||||
func (context *context) Language() string {
|
||||
return whisper.Whisper_lang_str(context.params.Language())
|
||||
}
|
||||
|
||||
// Set translate flag
|
||||
func (context *context) SetTranslate(v bool) {
|
||||
context.params.SetTranslate(v)
|
||||
}
|
||||
|
||||
// Set speedup flag
|
||||
func (context *context) SetSpeedup(v bool) {
|
||||
context.params.SetSpeedup(v)
|
||||
}
|
||||
|
||||
// Set number of threads to use
|
||||
func (context *context) SetThreads(v uint) {
|
||||
context.params.SetThreads(int(v))
|
||||
}
|
||||
|
||||
// Set time offset
|
||||
func (context *context) SetOffset(v time.Duration) {
|
||||
context.params.SetOffset(int(v.Milliseconds()))
|
||||
}
|
||||
|
||||
// Set duration of audio to process
|
||||
func (context *context) SetDuration(v time.Duration) {
|
||||
context.params.SetDuration(int(v.Milliseconds()))
|
||||
}
|
||||
|
||||
// Set timestamp token probability threshold (~0.01)
|
||||
func (context *context) SetTokenThreshold(t float32) {
|
||||
context.params.SetTokenThreshold(t)
|
||||
}
|
||||
|
||||
// Set timestamp token sum probability threshold (~0.01)
|
||||
func (context *context) SetTokenSumThreshold(t float32) {
|
||||
context.params.SetTokenSumThreshold(t)
|
||||
}
|
||||
|
||||
// Set max segment length in characters
|
||||
func (context *context) SetMaxSegmentLength(n uint) {
|
||||
context.params.SetMaxSegmentLength(int(n))
|
||||
}
|
||||
|
||||
// Set max tokens per segment (0 = no limit)
|
||||
func (context *context) SetMaxTokensPerSegment(n uint) {
|
||||
context.params.SetMaxTokensPerSegment(int(n))
|
||||
}
|
||||
|
||||
// Process new sample data and return any errors
|
||||
func (context *context) Process(data []float32, cb SegmentCallback) error {
|
||||
if context.model.ctx == nil {
|
||||
return ErrInternalAppError
|
||||
}
|
||||
// If a callback is defined, force single-segment mode
|
||||
if cb != nil {
|
||||
context.params.SetSingleSegment(true)
|
||||
}
|
||||
|
||||
// We don't do parallel processing at the moment
|
||||
processors := 0
|
||||
if processors > 1 {
|
||||
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, nil, func(new int) {
|
||||
if cb != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
for i := s0; i < num_segments; i++ {
|
||||
cb(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if err := context.model.ctx.Whisper_full(context.params, data, nil, func(new int) {
|
||||
if cb != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
for i := s0; i < num_segments; i++ {
|
||||
cb(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Return success
|
||||
return nil
|
||||
}
|
||||
|
||||
// Return the next segment of tokens
|
||||
func (context *context) NextSegment() (Segment, error) {
|
||||
if context.model.ctx == nil {
|
||||
return Segment{}, ErrInternalAppError
|
||||
}
|
||||
if context.n >= context.model.ctx.Whisper_full_n_segments() {
|
||||
return Segment{}, io.EOF
|
||||
}
|
||||
|
||||
// Populate result
|
||||
result := toSegment(context.model.ctx, context.n)
|
||||
|
||||
// Increment the cursor
|
||||
context.n++
|
||||
|
||||
// Return success
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// Test for text tokens
|
||||
func (context *context) IsText(t Token) bool {
|
||||
switch {
|
||||
case context.IsBEG(t):
|
||||
return false
|
||||
case context.IsSOT(t):
|
||||
return false
|
||||
case whisper.Token(t.Id) >= context.model.ctx.Whisper_token_eot():
|
||||
return false
|
||||
case context.IsPREV(t):
|
||||
return false
|
||||
case context.IsSOLM(t):
|
||||
return false
|
||||
case context.IsNOT(t):
|
||||
return false
|
||||
default:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
// Test for "begin" token
|
||||
func (context *context) IsBEG(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_beg()
|
||||
}
|
||||
|
||||
// Test for "start of transcription" token
|
||||
func (context *context) IsSOT(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_sot()
|
||||
}
|
||||
|
||||
// Test for "end of transcription" token
|
||||
func (context *context) IsEOT(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_eot()
|
||||
}
|
||||
|
||||
// Test for "start of prev" token
|
||||
func (context *context) IsPREV(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_prev()
|
||||
}
|
||||
|
||||
// Test for "start of lm" token
|
||||
func (context *context) IsSOLM(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_solm()
|
||||
}
|
||||
|
||||
// Test for "No timestamps" token
|
||||
func (context *context) IsNOT(t Token) bool {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_not()
|
||||
}
|
||||
|
||||
// Test for token associated with a specific language
|
||||
func (context *context) IsLANG(t Token, lang string) bool {
|
||||
if id := context.model.ctx.Whisper_lang_id(lang); id >= 0 {
|
||||
return whisper.Token(t.Id) == context.model.ctx.Whisper_token_lang(id)
|
||||
} else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PRIVATE METHODS
|
||||
|
||||
func toSegment(ctx *whisper.Context, n int) Segment {
|
||||
return Segment{
|
||||
Num: n,
|
||||
Text: strings.TrimSpace(ctx.Whisper_full_get_segment_text(n)),
|
||||
Start: time.Duration(ctx.Whisper_full_get_segment_t0(n)) * time.Millisecond * 10,
|
||||
End: time.Duration(ctx.Whisper_full_get_segment_t1(n)) * time.Millisecond * 10,
|
||||
Tokens: toTokens(ctx, n),
|
||||
}
|
||||
}
|
||||
|
||||
func toTokens(ctx *whisper.Context, n int) []Token {
|
||||
result := make([]Token, ctx.Whisper_full_n_tokens(n))
|
||||
for i := 0; i < len(result); i++ {
|
||||
result[i] = Token{
|
||||
Id: int(ctx.Whisper_full_get_token_id(n, i)),
|
||||
Text: strings.TrimSpace(ctx.Whisper_full_get_token_text(n, i)),
|
||||
P: ctx.Whisper_full_get_token_p(n, i),
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
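Putting the pieces above together, a minimal usage sketch of the high-level context (the model path and PCM loading are placeholders; the data is assumed to be 16 kHz mono float32, e.g. decoded from a WAV file):

```go
package transcribe

import (
	"fmt"

	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

// Transcribe prints each segment as soon as it is decoded. The data must be
// 16 kHz mono float32 PCM (e.g. decoded from a WAV file).
func Transcribe(modelPath string, data []float32) error {
	model, err := whisper.New(modelPath)
	if err != nil {
		return err
	}
	defer model.Close()

	ctx, err := model.NewContext()
	if err != nil {
		return err
	}

	// Passing a callback switches Process into single-segment mode,
	// so segments arrive incrementally.
	return ctx.Process(data, func(s whisper.Segment) {
		fmt.Printf("[%6s -> %6s] %s\n", s.Start, s.End, s.Text)
	})
}
```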
55
bindings/go/pkg/whisper/context_test.go
Normal file
@ -0,0 +1,55 @@
|
||||
package whisper_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"testing"
|
||||
|
||||
// Packages
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
|
||||
assert "github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
const (
|
||||
ModelPath = "../../models/ggml-tiny.bin"
|
||||
SamplePath = "../../samples/jfk.wav"
|
||||
)
|
||||
|
||||
func Test_Whisper_000(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, model not found:", ModelPath)
|
||||
}
|
||||
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, sample not found:", SamplePath)
|
||||
}
|
||||
|
||||
// Load model
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
assert.NoError(model.Close())
|
||||
|
||||
t.Log("languages=", model.Languages())
|
||||
}
|
||||
|
||||
func Test_Whisper_001(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, model not found:", ModelPath)
|
||||
}
|
||||
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, sample not found:", SamplePath)
|
||||
}
|
||||
|
||||
// Load model
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
// Get context for decoding
|
||||
ctx, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
assert.NotNil(ctx)
|
||||
|
||||
}
|
4
bindings/go/pkg/whisper/doc.go
Normal file
@ -0,0 +1,4 @@
|
||||
/*
|
||||
This is the higher-level speech-to-text whisper.cpp API for Go
|
||||
*/
|
||||
package whisper
|
85
bindings/go/pkg/whisper/interface.go
Normal file
@ -0,0 +1,85 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"io"
|
||||
"time"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
// SegmentCallback is the callback function for processing segments in real
|
||||
// time. It is called during the Process function
|
||||
type SegmentCallback func(Segment)
|
||||
|
||||
// Model is the interface to a whisper model. Create a new model with the
|
||||
// function whisper.New(string)
|
||||
type Model interface {
|
||||
io.Closer
|
||||
|
||||
// Return a new speech-to-text context.
|
||||
NewContext() (Context, error)
|
||||
|
||||
// Return true if the model is multilingual.
|
||||
IsMultilingual() bool
|
||||
|
||||
// Return all languages supported.
|
||||
Languages() []string
|
||||
}
|
||||
|
||||
// Context is the speech recognition context.
|
||||
type Context interface {
|
||||
SetLanguage(string) error // Set the language to use for speech recognition.
|
||||
SetTranslate(bool) // Set translate flag
|
||||
IsMultilingual() bool // Return true if the model is multilingual.
|
||||
Language() string // Get language
|
||||
|
||||
SetOffset(time.Duration) // Set offset
|
||||
SetDuration(time.Duration) // Set duration
|
||||
SetThreads(uint) // Set number of threads to use
|
||||
SetSpeedup(bool) // Set speedup flag
|
||||
SetTokenThreshold(float32) // Set timestamp token probability threshold
|
||||
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
|
||||
SetMaxSegmentLength(uint) // Set max segment length in characters
|
||||
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
|
||||
|
||||
// Process mono audio data and return any errors.
|
||||
// If defined, newly generated segments are passed to the
|
||||
// callback function during processing.
|
||||
Process([]float32, SegmentCallback) error
|
||||
|
||||
// After process is called, return segments until the end of the stream
|
||||
// is reached, when io.EOF is returned.
|
||||
NextSegment() (Segment, error)
|
||||
|
||||
IsBEG(Token) bool // Test for "begin" token
|
||||
IsSOT(Token) bool // Test for "start of transcription" token
|
||||
IsEOT(Token) bool // Test for "end of transcription" token
|
||||
IsPREV(Token) bool // Test for "start of prev" token
|
||||
IsSOLM(Token) bool // Test for "start of lm" token
|
||||
IsNOT(Token) bool // Test for "No timestamps" token
|
||||
IsLANG(Token, string) bool // Test for token associated with a specific language
|
||||
IsText(Token) bool // Test for text token
|
||||
}
|
||||
|
||||
// Segment is the text result of a speech recognition.
|
||||
type Segment struct {
|
||||
// Segment Number
|
||||
Num int
|
||||
|
||||
// Start and end timestamps for the segment.
|
||||
Start, End time.Duration
|
||||
|
||||
// The text of the segment.
|
||||
Text string
|
||||
|
||||
// The tokens of the segment.
|
||||
Tokens []Token
|
||||
}
|
||||
|
||||
// Token is a text or special token
|
||||
type Token struct {
|
||||
Id int
|
||||
Text string
|
||||
P float32
|
||||
}
|
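The interface also supports a pull-style workflow: after Process returns, NextSegment is called until io.EOF, and the Is* helpers let callers filter out special tokens. A hedged sketch (the function name is ours):

```go
package transcribe

import (
	"errors"
	"io"
	"strings"

	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

// CollectText drains all segments from a context that has already been
// processed, keeping only plain text tokens.
func CollectText(ctx whisper.Context) (string, error) {
	var sb strings.Builder
	for {
		seg, err := ctx.NextSegment()
		if errors.Is(err, io.EOF) {
			return strings.TrimSpace(sb.String()), nil
		}
		if err != nil {
			return "", err
		}
		for _, tok := range seg.Tokens {
			if ctx.IsText(tok) {
				sb.WriteString(tok.Text)
				sb.WriteString(" ")
			}
		}
	}
}
```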
100
bindings/go/pkg/whisper/model.go
Normal file
@ -0,0 +1,100 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os"
|
||||
"runtime"
|
||||
|
||||
// Bindings
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
type model struct {
|
||||
path string
|
||||
ctx *whisper.Context
|
||||
}
|
||||
|
||||
// Make sure model adheres to the interface
|
||||
var _ Model = (*model)(nil)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// LIFECYCLE
|
||||
|
||||
func New(path string) (Model, error) {
|
||||
model := new(model)
|
||||
if _, err := os.Stat(path); err != nil {
|
||||
return nil, err
|
||||
} else if ctx := whisper.Whisper_init(path); ctx == nil {
|
||||
return nil, ErrUnableToLoadModel
|
||||
} else {
|
||||
model.ctx = ctx
|
||||
model.path = path
|
||||
}
|
||||
|
||||
// Return success
|
||||
return model, nil
|
||||
}
|
||||
|
||||
func (model *model) Close() error {
|
||||
if model.ctx != nil {
|
||||
model.ctx.Whisper_free()
|
||||
}
|
||||
|
||||
// Release resources
|
||||
model.ctx = nil
|
||||
|
||||
// Return success
|
||||
return nil
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// STRINGIFY
|
||||
|
||||
func (model *model) String() string {
|
||||
str := "<whisper.model"
|
||||
if model.ctx != nil {
|
||||
str += fmt.Sprintf(" model=%q", model.path)
|
||||
}
|
||||
return str + ">"
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
// Return true if model is multilingual (language and translation options are supported)
|
||||
func (model *model) IsMultilingual() bool {
|
||||
return model.ctx.Whisper_is_multilingual() != 0
|
||||
}
|
||||
|
||||
// Return all languages recognized by the model
|
||||
func (model *model) Languages() []string {
|
||||
result := make([]string, 0, whisper.Whisper_lang_max_id())
|
||||
for i := 0; i < whisper.Whisper_lang_max_id(); i++ {
|
||||
str := whisper.Whisper_lang_str(i)
|
||||
if model.ctx.Whisper_lang_id(str) >= 0 {
|
||||
result = append(result, str)
|
||||
}
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
func (model *model) NewContext() (Context, error) {
|
||||
if model.ctx == nil {
|
||||
return nil, ErrInternalAppError
|
||||
}
|
||||
|
||||
// Create new context
|
||||
params := model.ctx.Whisper_full_default_params(whisper.SAMPLING_GREEDY)
|
||||
params.SetTranslate(false)
|
||||
params.SetPrintSpecial(false)
|
||||
params.SetPrintProgress(false)
|
||||
params.SetPrintRealtime(false)
|
||||
params.SetPrintTimestamps(false)
|
||||
params.SetThreads(runtime.NumCPU())
|
||||
|
||||
// Return new context
|
||||
return newContext(model, params)
|
||||
}
|
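NewContext above hands back a context pre-configured with greedy sampling and the thread count set to the number of CPUs; language selection is then done on the context. A small hedged sketch that checks multilinguality before asking for German (model path and function name are placeholders):

```go
package transcribe

import (
	"fmt"

	whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
)

// NewGermanContext loads a model and, when the model allows it, switches the
// returned context to German.
func NewGermanContext(modelPath string) (whisper.Model, whisper.Context, error) {
	model, err := whisper.New(modelPath)
	if err != nil {
		return nil, nil, err
	}

	ctx, err := model.NewContext()
	if err != nil {
		model.Close()
		return nil, nil, err
	}

	if model.IsMultilingual() {
		// Fails with ErrUnsupportedLanguage if "de" is not in model.Languages().
		if err := ctx.SetLanguage("de"); err != nil {
			model.Close()
			return nil, nil, err
		}
	} else {
		fmt.Println("model is English-only, keeping the default language")
	}
	return model, ctx, nil
}
```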
BIN
bindings/go/samples/jfk.wav
Normal file
Binary file not shown.
409
bindings/go/whisper.go
Normal file
@ -0,0 +1,409 @@
|
||||
package whisper
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// CGO
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lwhisper -lm -lstdc++
|
||||
#cgo darwin LDFLAGS: -framework Accelerate
|
||||
#include <whisper.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
extern void callNewSegment(void* user_data, int new);
|
||||
extern bool callEncoderBegin(void* user_data);
|
||||
|
||||
// Text segment callback
|
||||
// Called on every newly generated text segment
|
||||
// Use the whisper_full_...() functions to obtain the text segments
|
||||
static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void* user_data) {
|
||||
if(user_data != NULL && ctx != NULL) {
|
||||
callNewSegment(user_data, n_new);
|
||||
}
|
||||
}
|
||||
|
||||
// Encoder begin callback
|
||||
// If not NULL, called before the encoder starts
|
||||
// If it returns false, the computation is aborted
|
||||
static bool whisper_encoder_begin_cb(struct whisper_context* ctx, void* user_data) {
|
||||
if(user_data != NULL && ctx != NULL) {
|
||||
return callEncoderBegin(user_data);
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
// Get default parameters and set callbacks
|
||||
static struct whisper_full_params whisper_full_default_params_cb(struct whisper_context* ctx, enum whisper_sampling_strategy strategy) {
|
||||
struct whisper_full_params params = whisper_full_default_params(strategy);
|
||||
params.new_segment_callback = whisper_new_segment_cb;
|
||||
params.new_segment_callback_user_data = (void*)(ctx);
|
||||
params.encoder_begin_callback = whisper_encoder_begin_cb;
|
||||
params.encoder_begin_callback_user_data = (void*)(ctx);
|
||||
return params;
|
||||
}
|
||||
*/
|
||||
import "C"
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// TYPES
|
||||
|
||||
type (
|
||||
Context C.struct_whisper_context
|
||||
Token C.whisper_token
|
||||
TokenData C.struct_whisper_token_data
|
||||
SamplingStrategy C.enum_whisper_sampling_strategy
|
||||
Params C.struct_whisper_full_params
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// GLOBALS
|
||||
|
||||
const (
|
||||
SAMPLING_GREEDY SamplingStrategy = C.WHISPER_SAMPLING_GREEDY
|
||||
SAMPLING_BEAM_SEARCH SamplingStrategy = C.WHISPER_SAMPLING_BEAM_SEARCH
|
||||
)
|
||||
|
||||
const (
|
||||
SampleRate = C.WHISPER_SAMPLE_RATE // Expected sample rate, samples per second
|
||||
SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
|
||||
NumFFT = C.WHISPER_N_FFT
|
||||
NumMEL = C.WHISPER_N_MEL
|
||||
HopLength = C.WHISPER_HOP_LENGTH
|
||||
ChunkSize = C.WHISPER_CHUNK_SIZE
|
||||
)
|
||||
|
||||
var (
|
||||
ErrTokenizerFailed = errors.New("whisper_tokenize failed")
|
||||
ErrAutoDetectFailed = errors.New("whisper_lang_auto_detect failed")
|
||||
ErrConversionFailed = errors.New("whisper_convert failed")
|
||||
ErrInvalidLanguage = errors.New("invalid language")
|
||||
)
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// PUBLIC METHODS
|
||||
|
||||
// Allocates all memory needed for the model and loads the model from the given file.
|
||||
// Returns nil on failure.
|
||||
func Whisper_init(path string) *Context {
|
||||
cPath := C.CString(path)
|
||||
defer C.free(unsafe.Pointer(cPath))
|
||||
if ctx := C.whisper_init_from_file(cPath); ctx != nil {
|
||||
return (*Context)(ctx)
|
||||
} else {
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
// Frees all memory allocated by the model.
|
||||
func (ctx *Context) Whisper_free() {
|
||||
C.whisper_free((*C.struct_whisper_context)(ctx))
|
||||
}
|
||||
|
||||
// Convert RAW PCM audio to log mel spectrogram.
|
||||
// The resulting spectrogram is stored inside the provided whisper context.
|
||||
func (ctx *Context) Whisper_pcm_to_mel(data []float32, threads int) error {
|
||||
if C.whisper_pcm_to_mel((*C.struct_whisper_context)(ctx), (*C.float)(&data[0]), C.int(len(data)), C.int(threads)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
|
||||
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
|
||||
// n_mel must be 80
|
||||
func (ctx *Context) Whisper_set_mel(data []float32, n_mel int) error {
|
||||
if C.whisper_set_mel((*C.struct_whisper_context)(ctx), (*C.float)(&data[0]), C.int(len(data)), C.int(n_mel)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
|
||||
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
||||
// offset can be used to specify the offset of the first frame in the spectrogram.
|
||||
func (ctx *Context) Whisper_encode(offset, threads int) error {
|
||||
if C.whisper_encode((*C.struct_whisper_context)(ctx), C.int(offset), C.int(threads)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
|
||||
// Make sure to call whisper_encode() first.
|
||||
// tokens + n_tokens is the provided context for the decoder.
|
||||
// n_past is the number of tokens to use from previous decoder calls.
|
||||
func (ctx *Context) Whisper_decode(tokens []Token, past, threads int) error {
|
||||
if C.whisper_decode((*C.struct_whisper_context)(ctx), (*C.whisper_token)(&tokens[0]), C.int(len(tokens)), C.int(past), C.int(threads)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Convert the provided text into tokens. The tokens pointer must be large enough to hold the resulting tokens.
|
||||
// Returns the number of tokens on success
|
||||
func (ctx *Context) Whisper_tokenize(text string, tokens []Token) (int, error) {
|
||||
cText := C.CString(text)
|
||||
defer C.free(unsafe.Pointer(cText))
|
||||
if n := C.whisper_tokenize((*C.struct_whisper_context)(ctx), cText, (*C.whisper_token)(&tokens[0]), C.int(len(tokens))); n >= 0 {
|
||||
return int(n), nil
|
||||
} else {
|
||||
return 0, ErrTokenizerFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Return the id of the specified language, returns -1 if not found
|
||||
// Examples:
|
||||
//
|
||||
// "de" -> 2
|
||||
// "german" -> 2
|
||||
func (ctx *Context) Whisper_lang_id(lang string) int {
|
||||
cLang := C.CString(lang)
defer C.free(unsafe.Pointer(cLang))
return int(C.whisper_lang_id(cLang))
|
||||
}
|
||||
|
||||
// Largest language id (i.e. number of available languages - 1)
|
||||
func Whisper_lang_max_id() int {
|
||||
return int(C.whisper_lang_max_id())
|
||||
}
|
||||
|
||||
// Return the short string of the specified language id (e.g. 2 -> "de"),
|
||||
// returns empty string if not found
|
||||
func Whisper_lang_str(id int) string {
|
||||
return C.GoString(C.whisper_lang_str(C.int(id)))
|
||||
}
|
||||
|
||||
// Use mel data at offset_ms to try and auto-detect the spoken language
|
||||
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
|
||||
// Returns the probabilities of all languages.
|
||||
// ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69
|
||||
func (ctx *Context) Whisper_lang_auto_detect(offset_ms, n_threads int) ([]float32, error) {
|
||||
probs := make([]float32, Whisper_lang_max_id()+1)
|
||||
if n := int(C.whisper_lang_auto_detect((*C.struct_whisper_context)(ctx), C.int(offset_ms), C.int(n_threads), (*C.float)(&probs[0]))); n < 0 {
|
||||
return nil, ErrAutoDetectFailed
|
||||
} else {
|
||||
return probs, nil
|
||||
}
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_n_len() int {
|
||||
return int(C.whisper_n_len((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_n_vocab() int {
|
||||
return int(C.whisper_n_vocab((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_n_text_ctx() int {
|
||||
return int(C.whisper_n_text_ctx((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_n_audio_ctx() int {
|
||||
return int(C.whisper_n_audio_ctx((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
func (ctx *Context) Whisper_is_multilingual() int {
|
||||
return int(C.whisper_is_multilingual((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// The probabilities for the next token
|
||||
//func (ctx *Whisper_context) Whisper_get_probs() []float32 {
|
||||
// return (*[1 << 30]float32)(unsafe.Pointer(C.whisper_get_probs((*C.struct_whisper_context)(ctx))))[:ctx.Whisper_n_vocab()]
|
||||
//}
|
||||
|
||||
// Token Id -> String. Uses the vocabulary in the provided context
|
||||
func (ctx *Context) Whisper_token_to_str(token Token) string {
|
||||
return C.GoString(C.whisper_token_to_str((*C.struct_whisper_context)(ctx), C.whisper_token(token)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_eot() Token {
|
||||
return Token(C.whisper_token_eot((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_sot() Token {
|
||||
return Token(C.whisper_token_sot((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_prev() Token {
|
||||
return Token(C.whisper_token_prev((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_solm() Token {
|
||||
return Token(C.whisper_token_solm((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_not() Token {
|
||||
return Token(C.whisper_token_not((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_beg() Token {
|
||||
return Token(C.whisper_token_beg((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Special tokens
|
||||
func (ctx *Context) Whisper_token_lang(lang_id int) Token {
|
||||
return Token(C.whisper_token_lang((*C.struct_whisper_context)(ctx), C.int(lang_id)))
|
||||
}
|
||||
|
||||
// Task tokens
|
||||
func Whisper_token_translate() Token {
|
||||
return Token(C.whisper_token_translate())
|
||||
}
|
||||
|
||||
// Task tokens
|
||||
func Whisper_token_transcribe() Token {
|
||||
return Token(C.whisper_token_transcribe())
|
||||
}
|
||||
|
||||
// Performance information
|
||||
func (ctx *Context) Whisper_print_timings() {
|
||||
C.whisper_print_timings((*C.struct_whisper_context)(ctx))
|
||||
}
|
||||
|
||||
// Performance information
|
||||
func (ctx *Context) Whisper_reset_timings() {
|
||||
C.whisper_reset_timings((*C.struct_whisper_context)(ctx))
|
||||
}
|
||||
|
||||
// Print system information
|
||||
func Whisper_print_system_info() string {
|
||||
return C.GoString(C.whisper_print_system_info())
|
||||
}
|
||||
|
||||
// Return default parameters for a strategy
|
||||
func (ctx *Context) Whisper_full_default_params(strategy SamplingStrategy) Params {
|
||||
// Get default parameters
|
||||
return Params(C.whisper_full_default_params_cb((*C.struct_whisper_context)(ctx), C.enum_whisper_sampling_strategy(strategy)))
|
||||
}
|
||||
|
||||
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
// Uses the specified decoding strategy to obtain the text.
|
||||
func (ctx *Context) Whisper_full(params Params, samples []float32, encoderBeginCallback func() bool, newSegmentCallback func(int)) error {
|
||||
registerEncoderBeginCallback(ctx, encoderBeginCallback)
|
||||
registerNewSegmentCallback(ctx, newSegmentCallback)
|
||||
defer registerEncoderBeginCallback(ctx, nil)
|
||||
defer registerNewSegmentCallback(ctx, nil)
|
||||
if C.whisper_full((*C.struct_whisper_context)(ctx), (C.struct_whisper_full_params)(params), (*C.float)(&samples[0]), C.int(len(samples))) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Split the input audio in chunks and process each chunk separately using whisper_full()
|
||||
// It seems this approach can offer some speedup in some cases.
|
||||
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
func (ctx *Context) Whisper_full_parallel(params Params, samples []float32, processors int, encoderBeginCallback func() bool, newSegmentCallback func(int)) error {
|
||||
registerEncoderBeginCallback(ctx, encoderBeginCallback)
|
||||
registerNewSegmentCallback(ctx, newSegmentCallback)
|
||||
defer registerEncoderBeginCallback(ctx, nil)
|
||||
defer registerNewSegmentCallback(ctx, nil)
|
||||
|
||||
if C.whisper_full_parallel((*C.struct_whisper_context)(ctx), (C.struct_whisper_full_params)(params), (*C.float)(&samples[0]), C.int(len(samples)), C.int(processors)) == 0 {
|
||||
return nil
|
||||
} else {
|
||||
return ErrConversionFailed
|
||||
}
|
||||
}
|
||||
|
||||
// Number of generated text segments.
|
||||
// A segment can be a few words, a sentence, or even a paragraph.
|
||||
func (ctx *Context) Whisper_full_n_segments() int {
|
||||
return int(C.whisper_full_n_segments((*C.struct_whisper_context)(ctx)))
|
||||
}
|
||||
|
||||
// Get the start and end time of the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_segment_t0(segment int) int64 {
|
||||
return int64(C.whisper_full_get_segment_t0((*C.struct_whisper_context)(ctx), C.int(segment)))
|
||||
}
|
||||
|
||||
// Get the start and end time of the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_segment_t1(segment int) int64 {
|
||||
return int64(C.whisper_full_get_segment_t1((*C.struct_whisper_context)(ctx), C.int(segment)))
|
||||
}
|
||||
|
||||
// Get the text of the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_segment_text(segment int) string {
|
||||
return C.GoString(C.whisper_full_get_segment_text((*C.struct_whisper_context)(ctx), C.int(segment)))
|
||||
}
|
||||
|
||||
// Get number of tokens in the specified segment.
|
||||
func (ctx *Context) Whisper_full_n_tokens(segment int) int {
|
||||
return int(C.whisper_full_n_tokens((*C.struct_whisper_context)(ctx), C.int(segment)))
|
||||
}
|
||||
|
||||
// Get the token text of the specified token index in the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_token_text(segment int, token int) string {
|
||||
return C.GoString(C.whisper_full_get_token_text((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
|
||||
}
|
||||
|
||||
// Get the token of the specified token index in the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_token_id(segment int, token int) Token {
|
||||
return Token(C.whisper_full_get_token_id((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
|
||||
}
|
||||
|
||||
// Get token data for the specified token in the specified segment.
|
||||
// This contains probabilities, timestamps, etc.
|
||||
func (ctx *Context) whisper_full_get_token_data(segment int, token int) TokenData {
|
||||
return TokenData(C.whisper_full_get_token_data((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
|
||||
}
|
||||
|
||||
// Get the probability of the specified token in the specified segment.
|
||||
func (ctx *Context) Whisper_full_get_token_p(segment int, token int) float32 {
|
||||
return float32(C.whisper_full_get_token_p((*C.struct_whisper_context)(ctx), C.int(segment), C.int(token)))
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////
|
||||
// CALLBACKS
|
||||
|
||||
var (
|
||||
cbNewSegment = make(map[unsafe.Pointer]func(int))
|
||||
cbEncoderBegin = make(map[unsafe.Pointer]func() bool)
|
||||
)
|
||||
|
||||
func registerNewSegmentCallback(ctx *Context, fn func(int)) {
|
||||
if fn == nil {
|
||||
delete(cbNewSegment, unsafe.Pointer(ctx))
|
||||
} else {
|
||||
cbNewSegment[unsafe.Pointer(ctx)] = fn
|
||||
}
|
||||
}
|
||||
|
||||
func registerEncoderBeginCallback(ctx *Context, fn func() bool) {
|
||||
if fn == nil {
|
||||
delete(cbEncoderBegin, unsafe.Pointer(ctx))
|
||||
} else {
|
||||
cbEncoderBegin[unsafe.Pointer(ctx)] = fn
|
||||
}
|
||||
}
|
||||
|
||||
//export callNewSegment
|
||||
func callNewSegment(user_data unsafe.Pointer, new C.int) {
|
||||
if fn, ok := cbNewSegment[user_data]; ok {
|
||||
fn(int(new))
|
||||
}
|
||||
}
|
||||
|
||||
//export callEncoderBegin
|
||||
func callEncoderBegin(user_data unsafe.Pointer) C.bool {
|
||||
if fn, ok := cbEncoderBegin[user_data]; ok {
|
||||
if fn() {
|
||||
return C.bool(true)
|
||||
} else {
|
||||
return C.bool(false)
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
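At the low level, the callback registries above are keyed by the context pointer, so per-context Go closures can be handed straight to Whisper_full. A hedged sketch (model path and PCM loading are placeholders):

```go
package transcribe

import (
	"fmt"

	whisper "github.com/ggerganov/whisper.cpp/bindings/go"
)

// Run decodes pcm (16 kHz mono float32) and prints segments as they appear.
func Run(modelPath string, pcm []float32) error {
	ctx := whisper.Whisper_init(modelPath)
	if ctx == nil {
		return fmt.Errorf("failed to load model %q", modelPath)
	}
	defer ctx.Whisper_free()

	params := ctx.Whisper_full_default_params(whisper.SAMPLING_GREEDY)

	return ctx.Whisper_full(params, pcm,
		func() bool { return true }, // encoder begin: return false to abort
		func(n int) { // n newly decoded segments are available
			total := ctx.Whisper_full_n_segments()
			for i := total - n; i < total; i++ {
				fmt.Println(ctx.Whisper_full_get_segment_text(i))
			}
		})
}
```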
113
bindings/go/whisper_test.go
Normal file
@ -0,0 +1,113 @@
|
||||
package whisper_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"runtime"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
// Packages
|
||||
whisper "github.com/ggerganov/whisper.cpp/bindings/go"
|
||||
wav "github.com/go-audio/wav"
|
||||
assert "github.com/stretchr/testify/assert"
|
||||
)
|
||||
|
||||
const (
|
||||
ModelPath = "models/ggml-small.en.bin"
|
||||
SamplePath = "samples/jfk.wav"
|
||||
)
|
||||
|
||||
func Test_Whisper_000(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, model not found:", ModelPath)
|
||||
}
|
||||
ctx := whisper.Whisper_init(ModelPath)
|
||||
assert.NotNil(ctx)
|
||||
ctx.Whisper_free()
|
||||
}
|
||||
|
||||
func Test_Whisper_001(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, model not found:", ModelPath)
|
||||
}
|
||||
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, sample not found:", SamplePath)
|
||||
}
|
||||
|
||||
// Open samples
|
||||
fh, err := os.Open(SamplePath)
|
||||
assert.NoError(err)
|
||||
defer fh.Close()
|
||||
|
||||
// Read samples
|
||||
d := wav.NewDecoder(fh)
|
||||
buf, err := d.FullPCMBuffer()
|
||||
assert.NoError(err)
|
||||
|
||||
// Run whisper
|
||||
ctx := whisper.Whisper_init(ModelPath)
|
||||
assert.NotNil(ctx)
|
||||
defer ctx.Whisper_free()
|
||||
params := ctx.Whisper_full_default_params(whisper.SAMPLING_GREEDY)
|
||||
data := buf.AsFloat32Buffer().Data
|
||||
err = ctx.Whisper_full(params, data, nil, nil)
|
||||
assert.NoError(err)
|
||||
|
||||
// Print out tokens
|
||||
num_segments := ctx.Whisper_full_n_segments()
|
||||
assert.GreaterOrEqual(num_segments, 1)
|
||||
for i := 0; i < num_segments; i++ {
|
||||
str := ctx.Whisper_full_get_segment_text(i)
|
||||
assert.NotEmpty(str)
|
||||
t0 := time.Duration(ctx.Whisper_full_get_segment_t0(i)) * time.Millisecond
|
||||
t1 := time.Duration(ctx.Whisper_full_get_segment_t1(i)) * time.Millisecond
|
||||
t.Logf("[%6s->%-6s] %q", t0, t1, str)
|
||||
}
|
||||
}
|
||||
|
||||
func Test_Whisper_002(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
for i := 0; i < whisper.Whisper_lang_max_id(); i++ {
|
||||
str := whisper.Whisper_lang_str(i)
|
||||
assert.NotEmpty(str)
|
||||
t.Log(str)
|
||||
}
|
||||
}
|
||||
|
||||
func Test_Whisper_003(t *testing.T) {
|
||||
threads := runtime.NumCPU()
|
||||
assert := assert.New(t)
|
||||
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, model not found:", ModelPath)
|
||||
}
|
||||
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
|
||||
t.Skip("Skipping test, sample not found:", SamplePath)
|
||||
}
|
||||
|
||||
// Open samples
|
||||
fh, err := os.Open(SamplePath)
|
||||
assert.NoError(err)
|
||||
defer fh.Close()
|
||||
|
||||
// Read samples
|
||||
d := wav.NewDecoder(fh)
|
||||
buf, err := d.FullPCMBuffer()
|
||||
assert.NoError(err)
|
||||
|
||||
// Make the model
|
||||
ctx := whisper.Whisper_init(ModelPath)
|
||||
assert.NotNil(ctx)
|
||||
defer ctx.Whisper_free()
|
||||
|
||||
// Get MEL
|
||||
assert.NoError(ctx.Whisper_pcm_to_mel(buf.AsFloat32Buffer().Data, threads))
|
||||
|
||||
// Get Languages
|
||||
languages, err := ctx.Whisper_lang_auto_detect(0, threads)
|
||||
assert.NoError(err)
|
||||
for i, p := range languages {
|
||||
t.Logf("%s: %f", whisper.Whisper_lang_str(i), p)
|
||||
}
|
||||
}
|
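Test_Whisper_003 prints the probability of every language returned by Whisper_lang_auto_detect; choosing the detected language is then just an argmax over that slice. A small hedged helper (name is ours):

```go
package transcribe

import (
	whisper "github.com/ggerganov/whisper.cpp/bindings/go"
)

// BestLanguage returns the short code ("en", "de", ...) of the most probable
// language, given the slice produced by Whisper_lang_auto_detect.
func BestLanguage(probs []float32) string {
	if len(probs) == 0 {
		return ""
	}
	best := 0
	for i, p := range probs {
		if p > probs[best] {
			best = i
		}
	}
	return whisper.Whisper_lang_str(best)
}
```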
Submodule bindings/ios updated: dd58b25d84...f6334b026f
@ -20,7 +20,7 @@ struct whisper_context * g_context;
|
||||
EMSCRIPTEN_BINDINGS(whisper) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
if (g_context == nullptr) {
|
||||
g_context = whisper_init(path_model.c_str());
|
||||
g_context = whisper_init_from_file(path_model.c_str());
|
||||
if (g_context != nullptr) {
|
||||
return true;
|
||||
} else {
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "1.0.3",
|
||||
"version": "1.1.0",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
|
File diff suppressed because one or more lines are too long
17
cmake/DefaultTargetOptions.cmake
Normal file
@ -0,0 +1,17 @@
|
||||
# Set the default compile features and properties for a target.
|
||||
|
||||
if (NOT TARGET)
|
||||
message(FATAL_ERROR "TARGET not set before including DefaultTargetOptions")
|
||||
endif()
|
||||
|
||||
target_compile_features(${TARGET}
|
||||
PRIVATE
|
||||
cxx_std_11
|
||||
)
|
||||
|
||||
set_target_properties(${TARGET}
|
||||
PROPERTIES
|
||||
EXPORT_COMPILE_COMMANDS ON
|
||||
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
|
||||
INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
|
||||
)
|
@ -8,6 +8,8 @@ add_executable(${TARGET}
|
||||
emscripten.cpp
|
||||
)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
whisper
|
||||
)
|
||||
|
@ -28,6 +28,11 @@ void bench_main(size_t index) {
|
||||
return;
|
||||
}
|
||||
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
|
||||
}
|
||||
|
||||
if (int ret = whisper_encode(ctx, 0, n_threads) != 0) {
|
||||
fprintf(stderr, "error: failed to encode model: %d\n", ret);
|
||||
return;
|
||||
@ -52,7 +57,7 @@ EMSCRIPTEN_BINDINGS(bench) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
||||
if (g_contexts[i] == nullptr) {
|
||||
g_contexts[i] = whisper_init(path_model.c_str());
|
||||
g_contexts[i] = whisper_init_from_file(path_model.c_str());
|
||||
if (g_contexts[i] != nullptr) {
|
||||
if (g_worker.joinable()) {
|
||||
g_worker.join();
|
||||
|
@ -1,3 +1,6 @@
|
||||
set(TARGET bench)
|
||||
add_executable(${TARGET} bench.cpp)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
@ -7,6 +7,7 @@
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t what = 0; // what to benchmark: 0 - whisper encoder, 1 - memcpy, 2 - ggml_mul_mat
|
||||
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
};
|
||||
@ -23,6 +24,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-w" || arg == "--what") { params.what = atoi(argv[++i]); }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
@ -33,7 +35,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
||||
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
@ -41,19 +43,17 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -w N, --what N [%-7d] what to benchmark:\n", params.what);
|
||||
fprintf(stderr, " %-7s 0 - whisper encoder\n", "");
|
||||
fprintf(stderr, " %-7s 1 - memcpy\n", "");
|
||||
fprintf(stderr, " %-7s 2 - ggml_mul_mat\n", "");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int whisper_bench_encoder(const whisper_params & params) {
|
||||
// whisper init
|
||||
|
||||
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
||||
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
||||
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
@ -92,3 +92,22 @@ int main(int argc, char ** argv) {
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
int ret = -1;
|
||||
|
||||
switch (params.what) {
|
||||
case 0: ret = whisper_bench_encoder(params); break;
|
||||
case 1: ret = whisper_bench_memcpy(params.n_threads); break;
|
||||
case 2: ret = whisper_bench_ggml_mul_mat(params.n_threads); break;
|
||||
default: fprintf(stderr, "error: unknown benchmark: %d\n", params.what); break;
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
@ -8,6 +8,8 @@ add_executable(${TARGET}
|
||||
emscripten.cpp
|
||||
)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
whisper
|
||||
)
|
||||
|
@ -324,7 +324,7 @@ EMSCRIPTEN_BINDINGS(command) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
||||
if (g_contexts[i] == nullptr) {
|
||||
g_contexts[i] = whisper_init(path_model.c_str());
|
||||
g_contexts[i] = whisper_init_from_file(path_model.c_str());
|
||||
if (g_contexts[i] != nullptr) {
|
||||
g_running = true;
|
||||
if (g_worker.joinable()) {
|
||||
|
@ -2,6 +2,9 @@ if (WHISPER_SUPPORT_SDL2)
|
||||
# command
|
||||
set(TARGET command)
|
||||
add_executable(${TARGET} command.cpp)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
|
||||
target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif ()
|
||||
|
@ -8,13 +8,30 @@ More info is available in [issue #171](https://github.com/ggerganov/whisper.cpp/
|
||||
./command -m ./models/ggml-small.en.bin -t 8
|
||||
|
||||
# On Raspberry Pi, use tiny or base models + "-ac 768" for better performance
|
||||
./command -m ./models/ggml-tiny.en.bin -ac 768 -t 4 -c 0
|
||||
./command -m ./models/ggml-tiny.en.bin -ac 768 -t 3 -c 0
|
||||
```
|
||||
|
||||
https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a76d-5735c25c49da.mp4
|
||||
|
||||
Web version: [examples/command.wasm](/examples/command.wasm)
|
||||
|
||||
## Guided mode
|
||||
|
||||
"Guided mode" allows you to specify a list of commands (i.e. strings) and the transcription will be guided to classify your command into one from the list. This can be useful in situations where a device is listening only for a small subset of commands.
|
||||
|
||||
Initial tests show that this approach might be extremely efficient in terms of performance, since it integrates very well with the "partial Encoder" idea from #137.
|
||||
|
||||
```bash
|
||||
# Run in guided mode, the list of allowed commands is in commands.txt
|
||||
./command -m ./models/ggml-base.en.bin -cmd ./examples/command/commands.txt
|
||||
|
||||
# On Raspberry Pi, in guided mode you can use "-ac 128" for extra performance
|
||||
./command -m ./models/ggml-tiny.en.bin -cmd ./examples/command/commands.txt -ac 128 -t 3 -c 0
|
||||
```
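Under the hood, guided mode scores each allowed command against the decoder logits of the first predicted token: softmax the logits over the vocabulary, then average the probabilities of the command's tokens (see the changes to `command.cpp` below). A language-agnostic sketch of that scoring step in Go (illustrative only, not part of the tool):

```go
package guided

import "math"

// ScoreCommands softmaxes raw logits over the vocabulary and returns, for
// each command (a list of token ids), the average probability of its tokens.
func ScoreCommands(logits []float32, commands [][]int) []float64 {
	if len(logits) == 0 {
		return nil
	}

	// Softmax with the usual max-subtraction for numerical stability.
	maxLogit := float64(logits[0])
	for _, l := range logits {
		if float64(l) > maxLogit {
			maxLogit = float64(l)
		}
	}
	probs := make([]float64, len(logits))
	var sum float64
	for i, l := range logits {
		probs[i] = math.Exp(float64(l) - maxLogit)
		sum += probs[i]
	}
	for i := range probs {
		probs[i] /= sum
	}

	// Average token probability per command; the tool then normalizes these
	// scores across commands and picks the highest one.
	scores := make([]float64, len(commands))
	for i, toks := range commands {
		for _, t := range toks {
			scores[i] += probs[t]
		}
		if len(toks) > 0 {
			scores[i] /= float64(len(toks))
		}
	}
	return scores
}
```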
|
||||
|
||||
https://user-images.githubusercontent.com/1991296/207435352-8fc4ed3f-bde5-4555-9b8b-aeeb76bee969.mp4
|
||||
|
||||
|
||||
## Building
|
||||
|
||||
The `command` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
|
||||
|
@ -11,6 +11,7 @@
|
||||
#include <SDL.h>
|
||||
#include <SDL_audio.h>
|
||||
|
||||
#include <sstream>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
@ -19,12 +20,13 @@
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <map>
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t prompt_ms = 5000;
|
||||
int32_t command_ms = 4000;
|
||||
int32_t command_ms = 8000;
|
||||
int32_t capture_id = -1;
|
||||
int32_t max_tokens = 32;
|
||||
int32_t audio_ctx = 0;
|
||||
@ -40,7 +42,9 @@ struct whisper_params {
|
||||
|
||||
std::string language = "en";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
std::string fname_out = "";
|
||||
std::string fname_out;
|
||||
std::string commands;
|
||||
std::string prompt;
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
@ -68,6 +72,8 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
||||
else if (arg == "-cmd" || arg == "--commands") { params.commands = argv[++i]; }
|
||||
else if (arg == "-p" || arg == "--prompt") { params.prompt = argv[++i]; }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
@ -78,27 +84,29 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
||||
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
|
||||
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
|
||||
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
||||
fprintf(stderr, " -cmd FNAME, --commands FNAME [%-7s] text file with allowed commands\n", params.commands.c_str());
|
||||
fprintf(stderr, " -p, --prompt [%-7s] the required activation prompt\n", params.prompt.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@ -383,7 +391,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
|
||||
float energy_all = 0.0f;
|
||||
float energy_last = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < n_samples; i++) {
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
energy_all += fabsf(pcmf32[i]);
|
||||
|
||||
if (i >= n_samples - n_samples_last) {
|
||||
@ -484,54 +492,350 @@ float similarity(const std::string & s0, const std::string & s1) {
|
||||
return 1.0f - (dist / std::max(s0.size(), s1.size()));
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
std::vector<std::string> read_allowed_commands(const std::string & fname) {
|
||||
std::vector<std::string> allowed_commands;
|
||||
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
std::ifstream ifs(fname);
|
||||
if (!ifs.is_open()) {
|
||||
return allowed_commands;
|
||||
}
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
std::string line;
|
||||
while (std::getline(ifs, line)) {
|
||||
line = trim(line);
|
||||
if (line.empty()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
std::transform(line.begin(), line.end(),line.begin(), ::tolower);
|
||||
allowed_commands.push_back(std::move(line));
|
||||
}
|
||||
|
||||
// whisper init
|
||||
return allowed_commands;
|
||||
}
|
||||
|
||||
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
||||
std::vector<std::string> get_words(const std::string &txt) {
|
||||
std::vector<std::string> words;
|
||||
|
||||
// print some info about the processing
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
std::istringstream iss(txt);
|
||||
std::string word;
|
||||
while (iss >> word) {
|
||||
words.push_back(word);
|
||||
}
|
||||
|
||||
return words;
|
||||
}
|
||||
|
||||
// returns true if no exit event was received
|
||||
bool process_sdl_events() {
|
||||
SDL_Event event;
|
||||
while (SDL_PollEvent(&event)) {
|
||||
switch (event.type) {
|
||||
case SDL_QUIT:
|
||||
{
|
||||
return false;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// command-list mode
|
||||
// guide the transcription to match the most likely command from a provided list
|
||||
int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: guided mode\n", __func__);
|
||||
|
||||
std::vector<std::string> allowed_commands = read_allowed_commands(params.commands);
|
||||
|
||||
if (allowed_commands.empty()) {
|
||||
fprintf(stderr, "%s: error: failed to read allowed commands from '%s'\n", __func__, params.commands.c_str());
|
||||
return 2;
|
||||
}
|
||||
|
||||
int max_len = 0;
|
||||
|
||||
std::vector<std::vector<whisper_token>> allowed_tokens;
|
||||
|
||||
for (const auto & cmd : allowed_commands) {
|
||||
whisper_token tokens[1024];
|
||||
allowed_tokens.emplace_back();
|
||||
|
||||
for (int l = 0; l < (int) cmd.size(); ++l) {
|
||||
// NOTE: very important to add the whitespace !
|
||||
// the reason is that the first decoded token starts with a whitespace too!
|
||||
std::string ss = std::string(" ") + cmd.substr(0, l + 1);
|
||||
|
||||
const int n = whisper_tokenize(ctx, ss.c_str(), tokens, 1024);
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, cmd.c_str());
|
||||
return 3;
|
||||
}
|
||||
|
||||
if (n == 1) {
|
||||
allowed_tokens.back().push_back(tokens[0]);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
||||
__func__,
|
||||
params.n_threads,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
max_len = std::max(max_len, (int) cmd.size());
|
||||
}
|
||||
|
||||
|
||||
// init audio
|
||||
|
||||
audio_async audio(30*1000);
|
||||
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
|
||||
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
|
||||
return 1;
|
||||
fprintf(stderr, "%s: allowed commands [ tokens ]:\n", __func__);
|
||||
fprintf(stderr, "\n");
|
||||
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
|
||||
fprintf(stderr, " - \033[1m%-*s\033[0m = [", max_len, allowed_commands[i].c_str());
|
||||
for (const auto & token : allowed_tokens[i]) {
|
||||
fprintf(stderr, " %5d", token);
|
||||
}
|
||||
fprintf(stderr, " ]\n");
|
||||
}
|
||||
|
||||
audio.resume();
|
||||
std::string k_prompt = "select one from the available words: ";
|
||||
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
|
||||
if (i > 0) {
|
||||
k_prompt += ", ";
|
||||
}
|
||||
k_prompt += allowed_commands[i];
|
||||
}
|
||||
k_prompt += ". selected word: ";
|
||||
|
||||
// tokenize prompt
|
||||
std::vector<whisper_token> k_tokens;
|
||||
{
|
||||
k_tokens.resize(1024);
|
||||
const int n = whisper_tokenize(ctx, k_prompt.c_str(), k_tokens.data(), 1024);
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "%s: error: failed to tokenize prompt '%s'\n", __func__, k_prompt.c_str());
|
||||
return 4;
|
||||
}
|
||||
k_tokens.resize(n);
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: prompt: '%s'\n", __func__, k_prompt.c_str());
|
||||
fprintf(stderr, "%s: tokens: [", __func__);
|
||||
for (const auto & token : k_tokens) {
|
||||
fprintf(stderr, " %d", token);
|
||||
}
|
||||
fprintf(stderr, " ]\n");
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: listening for a command ...\n", __func__);
|
||||
fprintf(stderr, "\n");
|
||||
|
||||
bool is_running = true;
|
||||
|
||||
std::vector<float> pcmf32_cur;
|
||||
std::vector<float> pcmf32_prompt;
|
||||
|
||||
// main loop
|
||||
while (is_running) {
|
||||
// handle Ctrl + C
|
||||
is_running = process_sdl_events();
|
||||
|
||||
// delay
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
audio.get(2000, pcmf32_cur);
|
||||
|
||||
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
||||
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
||||
|
||||
const auto t_start = std::chrono::high_resolution_clock::now();
|
||||
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = true;
|
||||
wparams.max_tokens = 1;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
wparams.prompt_tokens = k_tokens.data();
|
||||
wparams.prompt_n_tokens = k_tokens.size();
|
||||
|
||||
// run the transformer and a single decoding pass
|
||||
if (whisper_full(ctx, wparams, pcmf32_cur.data(), pcmf32_cur.size()) != 0) {
|
||||
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
|
||||
break;
|
||||
}
|
||||
|
||||
// estimate command probability
|
||||
// NOTE: not optimal
|
||||
{
|
||||
const auto * logits = whisper_get_logits(ctx);
|
||||
|
||||
std::vector<float> probs(whisper_n_vocab(ctx), 0.0f);
|
||||
|
||||
// compute probs from logits via softmax
{
float max = -1e9;
for (int i = 0; i < (int) probs.size(); ++i) {
max = std::max(max, logits[i]);
}

float sum = 0.0f;
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] = expf(logits[i] - max);
sum += probs[i];
}

for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] /= sum;
}
}
|
||||
|
||||
std::vector<std::pair<float, int>> probs_id;
|
||||
|
||||
double psum = 0.0;
|
||||
for (int i = 0; i < (int) allowed_commands.size(); ++i) {
|
||||
probs_id.emplace_back(probs[allowed_tokens[i][0]], i);
|
||||
for (int j = 1; j < (int) allowed_tokens[i].size(); ++j) {
|
||||
probs_id.back().first += probs[allowed_tokens[i][j]];
|
||||
}
|
||||
probs_id.back().first /= allowed_tokens[i].size();
|
||||
psum += probs_id.back().first;
|
||||
}
|
||||
|
||||
// normalize
|
||||
for (auto & p : probs_id) {
|
||||
p.first /= psum;
|
||||
}
|
||||
|
||||
// sort descending
|
||||
{
|
||||
using pair_type = decltype(probs_id)::value_type;
|
||||
std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
}
|
||||
|
||||
// print the commands and the respective probabilities
|
||||
{
|
||||
fprintf(stdout, "\n");
|
||||
for (const auto & cmd : probs_id) {
|
||||
fprintf(stdout, "%s: %s%-*s%s = %f | ", __func__, "\033[1m", max_len, allowed_commands[cmd.second].c_str(), "\033[0m", cmd.first);
|
||||
for (int token : allowed_tokens[cmd.second]) {
|
||||
fprintf(stdout, "'%4s' %f ", whisper_token_to_str(ctx, token), probs[token]);
|
||||
}
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
// best command
|
||||
{
|
||||
const auto t_end = std::chrono::high_resolution_clock::now();
|
||||
|
||||
const float prob = probs_id[0].first;
|
||||
const int index = probs_id[0].second;
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "%s: detected command: %s%s%s | p = %f | t = %d ms\n", __func__,
|
||||
"\033[1m", allowed_commands[index].c_str(), "\033[0m", prob,
|
||||
(int) std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count());
|
||||
fprintf(stdout, "\n");
|
||||
}
|
||||
}
|
||||
|
||||
audio.clear();
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
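For readers skimming the diff, the command-scoring logic in `process_command_list` above reduces to the standalone sketch below. This is an editorial aside, not part of the commit; the name `score_commands` and its inputs are illustrative. The idea: softmax over the vocabulary logits, average the resulting probabilities over each command's tokens, renormalize across commands, and sort by probability.

```cpp
// Editorial sketch of the command-scoring step above; not part of the commit.
// `logits` is one row of decoder logits, `allowed_tokens[i]` holds the token ids of command i.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

std::vector<std::pair<float, int>> score_commands(
        const std::vector<float>            & logits,          // assumed non-empty
        const std::vector<std::vector<int>> & allowed_tokens) {
    // softmax over the full vocabulary (subtract the max for numerical stability)
    std::vector<float> probs(logits.size(), 0.0f);
    const float max = *std::max_element(logits.begin(), logits.end());

    float sum = 0.0f;
    for (size_t i = 0; i < logits.size(); ++i) {
        probs[i] = std::exp(logits[i] - max);
        sum += probs[i];
    }
    for (auto & p : probs) {
        p /= sum;
    }

    // average probability of each command's tokens, then renormalize across commands
    std::vector<std::pair<float, int>> probs_id;
    double psum = 0.0;
    for (int i = 0; i < (int) allowed_tokens.size(); ++i) {
        float p = 0.0f;
        for (const int token : allowed_tokens[i]) {
            p += probs[token];
        }
        p /= allowed_tokens[i].size();
        probs_id.emplace_back(p, i);
        psum += p;
    }
    for (auto & p : probs_id) {
        p.first /= psum;
    }

    // highest-probability command first
    std::sort(probs_id.begin(), probs_id.end(),
              [](const std::pair<float, int> & a, const std::pair<float, int> & b) {
                  return a.first > b.first;
              });

    return probs_id;
}
```

Since the diff sets `wparams.max_tokens = 1`, a single logits distribution is scored and multi-token commands are handled by averaging their tokens' probabilities from that one distribution, which is presumably what the `// NOTE: not optimal` comment alludes to.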
|
||||
|
||||
// always-prompt mode
|
||||
// transcribe the voice into text after valid prompt
|
||||
int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
|
||||
bool is_running = true;
|
||||
bool ask_prompt = true;
|
||||
|
||||
float prob = 0.0f;
|
||||
|
||||
std::vector<float> pcmf32_cur;
|
||||
|
||||
const std::string k_prompt = params.prompt;
|
||||
|
||||
const int k_prompt_length = get_words(k_prompt).size();
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: always-prompt mode\n", __func__);
|
||||
|
||||
// main loop
|
||||
while (is_running) {
|
||||
// handle Ctrl + C
|
||||
is_running = process_sdl_events();
|
||||
|
||||
// delay
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
|
||||
if (ask_prompt) {
|
||||
fprintf(stdout, "\n");
|
||||
fprintf(stdout, "%s: The prompt is: '%s%s%s'\n", __func__, "\033[1m", k_prompt.c_str(), "\033[0m");
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
ask_prompt = false;
|
||||
}
|
||||
|
||||
{
|
||||
audio.get(2000, pcmf32_cur);
|
||||
|
||||
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
||||
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
||||
|
||||
int64_t t_ms = 0;
|
||||
|
||||
// detect the commands
|
||||
audio.get(params.command_ms, pcmf32_cur);
|
||||
|
||||
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob, t_ms));
|
||||
|
||||
const auto words = get_words(txt);
|
||||
|
||||
std::string prompt;
|
||||
std::string command;
|
||||
|
||||
for (int i = 0; i < (int) words.size(); ++i) {
|
||||
if (i < k_prompt_length) {
|
||||
prompt += words[i] + " ";
|
||||
} else {
|
||||
command += words[i] + " ";
|
||||
}
|
||||
}
|
||||
|
||||
const float sim = similarity(prompt, k_prompt);
|
||||
|
||||
//debug
|
||||
//fprintf(stdout, "command size: %i\n", command_length);
|
||||
|
||||
if ((sim > 0.7f) && (command.size() > 0)) {
|
||||
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
|
||||
}
|
||||
|
||||
fprintf(stdout, "\n");
|
||||
|
||||
audio.clear();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
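The split of the recognized text into a prompt part and a command part in `always_prompt_transcription` can be summarized by the following standalone sketch. Editorial aside: `split_words` and the naive positional word-overlap score are illustrative stand-ins for the example's own `get_words`/`similarity` helpers, which live elsewhere in the file and may be implemented differently.

```cpp
// Editorial sketch of the prompt/command split in always_prompt_transcription above.
// The positional word-overlap score is a stand-in for the example's own similarity() helper.
#include <sstream>
#include <string>
#include <vector>

static std::vector<std::string> split_words(const std::string & txt) {
    std::vector<std::string> words;
    std::istringstream iss(txt);
    std::string w;
    while (iss >> w) {
        words.push_back(w);
    }
    return words;
}

// returns the command part of `txt`, or an empty string if the leading words
// do not match the expected prompt closely enough
static std::string extract_command(const std::string & txt, const std::string & k_prompt, float thold = 0.7f) {
    const auto words        = split_words(txt);
    const auto prompt_words = split_words(k_prompt);

    std::string prompt;
    std::string command;
    for (size_t i = 0; i < words.size(); ++i) {
        if (i < prompt_words.size()) {
            prompt += words[i] + " ";
        } else {
            command += words[i] + " ";
        }
    }

    // naive similarity: fraction of prompt positions where the heard word matches
    const auto heard = split_words(prompt);
    size_t match = 0;
    for (size_t i = 0; i < heard.size() && i < prompt_words.size(); ++i) {
        if (heard[i] == prompt_words[i]) {
            ++match;
        }
    }
    const float sim = prompt_words.empty() ? 0.0f : float(match)/float(prompt_words.size());

    return (sim > thold && !command.empty()) ? command : std::string();
}
```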
|
||||
|
||||
// general-purpose mode
|
||||
// freely transcribe the voice into text
|
||||
int process_general_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms) {
|
||||
bool is_running = true;
|
||||
bool have_prompt = false;
|
||||
bool ask_prompt = true;
|
||||
@ -544,26 +848,13 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const std::string k_prompt = "Ok Whisper, start listening for commands.";
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "%s: general-purpose mode\n", __func__);
|
||||
|
||||
// main loop
|
||||
while (is_running) {
|
||||
// handle Ctrl + C
|
||||
{
|
||||
SDL_Event event;
|
||||
while (SDL_PollEvent(&event)) {
|
||||
switch (event.type) {
|
||||
case SDL_QUIT:
|
||||
{
|
||||
is_running = false;
|
||||
} break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!is_running) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
is_running = process_sdl_events();
|
||||
|
||||
// delay
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(100));
|
||||
@ -576,15 +867,16 @@ int main(int argc, char ** argv) {
|
||||
ask_prompt = false;
|
||||
}
|
||||
|
||||
int64_t t_ms = 0;
|
||||
|
||||
{
|
||||
audio.get(2000, pcmf32_cur);
|
||||
|
||||
if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
||||
fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
|
||||
|
||||
int64_t t_ms = 0;
|
||||
|
||||
if (!have_prompt) {
|
||||
// wait for activation phrase
|
||||
audio.get(params.prompt_ms, pcmf32_cur);
|
||||
|
||||
const auto txt = ::trim(::transcribe(ctx, params, pcmf32_cur, prob0, t_ms));
|
||||
@ -607,6 +899,7 @@ int main(int argc, char ** argv) {
|
||||
have_prompt = true;
|
||||
}
|
||||
} else {
|
||||
// we have heard the activation phrase, now detect the commands
|
||||
audio.get(params.command_ms, pcmf32_cur);
|
||||
|
||||
// prepend the prompt audio
|
||||
@ -645,10 +938,74 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// whisper init
|
||||
|
||||
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
||||
|
||||
// print some info about the processing
|
||||
{
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
|
||||
__func__,
|
||||
params.n_threads,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// init audio
|
||||
|
||||
audio_async audio(30*1000);
|
||||
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
|
||||
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
audio.resume();
|
||||
|
||||
// wait for 1 second to avoid any buffered noise
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
|
||||
audio.clear();
|
||||
|
||||
int ret_val = 0;
|
||||
|
||||
if (!params.commands.empty()) {
|
||||
ret_val = process_command_list(ctx, audio, params);
|
||||
} else if (!params.prompt.empty()) {
|
||||
ret_val = always_prompt_transcription(ctx, audio, params);
|
||||
} else {
|
||||
ret_val = process_general_transcription(ctx, audio, params);
|
||||
}
|
||||
|
||||
audio.pause();
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
return 0;
|
||||
return ret_val;
|
||||
}
|
||||
|
9	examples/command/commands.txt	Normal file
@@ -0,0 +1,9 @@
enable
disable
cat
dog
apple
red
blue
green
lightblue
@@ -1,3 +1,6 @@
set(TARGET main)
add_executable(${TARGET} main.cpp)

include(DefaultTargetOptions)

target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
@ -59,22 +59,29 @@ struct whisper_params {
|
||||
int32_t duration_ms = 0;
|
||||
int32_t max_context = -1;
|
||||
int32_t max_len = 0;
|
||||
int32_t best_of = 5;
|
||||
int32_t beam_size = -1;
|
||||
|
||||
float word_thold = 0.01f;
|
||||
float word_thold = 0.01f;
|
||||
float entropy_thold = 2.4f;
|
||||
float logprob_thold = -1.0f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool diarize = false;
|
||||
bool output_txt = false;
|
||||
bool output_vtt = false;
|
||||
bool output_srt = false;
|
||||
bool output_wts = false;
|
||||
bool print_special = false;
|
||||
bool print_colors = false;
|
||||
bool no_timestamps = false;
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool diarize = false;
|
||||
bool output_txt = false;
|
||||
bool output_vtt = false;
|
||||
bool output_srt = false;
|
||||
bool output_wts = false;
|
||||
bool output_csv = false;
|
||||
bool print_special = false;
|
||||
bool print_colors = false;
|
||||
bool print_progress = false;
|
||||
bool no_timestamps = false;
|
||||
|
||||
std::string language = "en";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
std::string language = "en";
|
||||
std::string prompt;
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
|
||||
std::vector<std::string> fname_inp = {};
|
||||
};
|
||||
@ -94,27 +101,34 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
|
||||
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
||||
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
||||
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
|
||||
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
||||
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
||||
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
||||
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.push_back(argv[++i]); }
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-p" || arg == "--processors") { params.n_processors = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ot" || arg == "--offset-t") { params.offset_t_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-on" || arg == "--offset-n") { params.offset_n = std::stoi(argv[++i]); }
|
||||
else if (arg == "-d" || arg == "--duration") { params.duration_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mc" || arg == "--max-context") { params.max_context = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ml" || arg == "--max-len") { params.max_len = std::stoi(argv[++i]); }
|
||||
else if (arg == "-bo" || arg == "--best-of") { params.best_of = std::stoi(argv[++i]); }
|
||||
else if (arg == "-bs" || arg == "--beam-size") { params.beam_size = std::stoi(argv[++i]); }
|
||||
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
||||
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
|
||||
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
|
||||
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
|
||||
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
|
||||
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
|
||||
else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; }
|
||||
else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if ( arg == "--prompt") { params.prompt = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
@ -125,33 +139,40 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
||||
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
||||
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
||||
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
||||
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
||||
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
||||
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
||||
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
||||
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
||||
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -p N, --processors N [%-7d] number of processors to use during computation\n", params.n_processors);
|
||||
fprintf(stderr, " -ot N, --offset-t N [%-7d] time offset in milliseconds\n", params.offset_t_ms);
|
||||
fprintf(stderr, " -on N, --offset-n N [%-7d] segment index offset\n", params.offset_n);
|
||||
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
|
||||
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
|
||||
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
|
||||
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
|
||||
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
|
||||
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
||||
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
||||
fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false");
|
||||
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
|
||||
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
|
||||
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
|
||||
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
|
||||
fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false");
|
||||
fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str());
|
||||
fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@ -167,90 +188,81 @@ void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, voi
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
|
||||
std::string speaker = "";
|
||||
|
||||
int64_t t0;
|
||||
int64_t t1;
|
||||
|
||||
// print the last n_new segments
|
||||
const int s0 = n_segments - n_new;
|
||||
|
||||
if (s0 == 0) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
for (int i = s0; i < n_segments; i++) {
|
||||
if (params.no_timestamps) {
|
||||
if (params.print_colors) {
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||
|
||||
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||
|
||||
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
|
||||
}
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
printf("%s", text);
|
||||
}
|
||||
fflush(stdout);
|
||||
} else {
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
std::string speaker = "";
|
||||
|
||||
if (params.diarize && pcmf32s.size() == 2) {
|
||||
const int64_t n_samples = pcmf32s[0].size();
|
||||
|
||||
const int64_t is0 = timestamp_to_sample(t0, n_samples);
|
||||
const int64_t is1 = timestamp_to_sample(t1, n_samples);
|
||||
|
||||
double energy0 = 0.0f;
|
||||
double energy1 = 0.0f;
|
||||
|
||||
for (int64_t j = is0; j < is1; j++) {
|
||||
energy0 += fabs(pcmf32s[0][j]);
|
||||
energy1 += fabs(pcmf32s[1][j]);
|
||||
}
|
||||
|
||||
if (energy0 > 1.1*energy1) {
|
||||
speaker = "(speaker 0)";
|
||||
} else if (energy1 > 1.1*energy0) {
|
||||
speaker = "(speaker 1)";
|
||||
} else {
|
||||
speaker = "(speaker ?)";
|
||||
}
|
||||
|
||||
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
|
||||
}
|
||||
|
||||
if (params.print_colors) {
|
||||
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||
|
||||
const int col = std::max(0, std::min((int) k_colors.size(), (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||
|
||||
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
||||
}
|
||||
printf("\n");
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
printf("[%s --> %s] %s%s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), speaker.c_str(), text);
|
||||
}
|
||||
if (!params.no_timestamps || params.diarize) {
|
||||
t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
}
|
||||
|
||||
if (!params.no_timestamps) {
|
||||
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
|
||||
}
|
||||
|
||||
if (params.diarize && pcmf32s.size() == 2) {
|
||||
const int64_t n_samples = pcmf32s[0].size();
|
||||
|
||||
const int64_t is0 = timestamp_to_sample(t0, n_samples);
|
||||
const int64_t is1 = timestamp_to_sample(t1, n_samples);
|
||||
|
||||
double energy0 = 0.0f;
|
||||
double energy1 = 0.0f;
|
||||
|
||||
for (int64_t j = is0; j < is1; j++) {
|
||||
energy0 += fabs(pcmf32s[0][j]);
|
||||
energy1 += fabs(pcmf32s[1][j]);
|
||||
}
|
||||
|
||||
if (energy0 > 1.1*energy1) {
|
||||
speaker = "(speaker 0)";
|
||||
} else if (energy1 > 1.1*energy0) {
|
||||
speaker = "(speaker 1)";
|
||||
} else {
|
||||
speaker = "(speaker ?)";
|
||||
}
|
||||
|
||||
//printf("is0 = %lld, is1 = %lld, energy0 = %f, energy1 = %f, %s\n", is0, is1, energy0, energy1, speaker.c_str());
|
||||
}
|
||||
|
||||
if (params.print_colors) {
|
||||
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
|
||||
if (params.print_special == false) {
|
||||
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
|
||||
if (id >= whisper_token_eot(ctx)) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const char * text = whisper_full_get_token_text(ctx, i, j);
|
||||
const float p = whisper_full_get_token_p (ctx, i, j);
|
||||
|
||||
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
|
||||
|
||||
printf("%s%s%s%s", speaker.c_str(), k_colors[col].c_str(), text, "\033[0m");
|
||||
}
|
||||
} else {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
|
||||
printf("%s%s", speaker.c_str(), text);
|
||||
}
|
||||
|
||||
// with timestamps or speakers: each segment on new line
|
||||
if (!params.no_timestamps || params.diarize) {
|
||||
printf("\n");
|
||||
}
|
||||
|
||||
fflush(stdout);
|
||||
}
|
||||
}
|
||||
|
||||
@ -319,10 +331,35 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
|
||||
return true;
|
||||
}
|
||||
|
||||
bool output_csv(struct whisper_context * ctx, const char * fname) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
        return false;
    }

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        if (text[0] == ' ') {
            text = text + sizeof(char); // whisper_full_get_segment_text() returns a string with a leading space; skip it
        }
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

        // times returned by whisper_full_get_segment_t{0,1}() are in 10 ms units; multiply by 10 to get milliseconds
        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text << "\"\n";
    }

    return true;
}
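As the comment above notes, whisper segment timestamps are returned in 10 ms units, which is why the CSV writer multiplies by 10. A tiny helper along these lines (names illustrative, editorial aside) makes the unit conversion explicit:

```cpp
// Segment timestamps from whisper_full_get_segment_t0/t1 are in 10 ms units.
#include <cstdint>

static int64_t t_to_ms(int64_t t) { return t * 10;           } // -> milliseconds
static double  t_to_s (int64_t t) { return double(t) / 100.0; } // -> seconds
```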
|
||||
|
||||
// karaoke video generation
|
||||
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
|
||||
// TODO: font parameter adjustments
|
||||
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
|
||||
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
|
||||
std::ofstream fout(fname);
|
||||
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
@ -371,7 +408,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
txt_ul = "\\ \\ ";
|
||||
|
||||
{
|
||||
int ncnt = 0;
|
||||
for (int k = 0; k < n; ++k) {
|
||||
const auto & token2 = tokens[k];
|
||||
|
||||
@ -395,8 +431,6 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
|
||||
txt_ul += "\\ ";
|
||||
}
|
||||
}
|
||||
|
||||
ncnt += txt.size();
|
||||
}
|
||||
|
||||
::replace_all(txt_bg, "'", "\u2019");
|
||||
@ -447,7 +481,7 @@ int main(int argc, char ** argv) {
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
@ -455,13 +489,29 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// whisper init
|
||||
|
||||
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
||||
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
||||
|
||||
if (ctx == nullptr) {
|
||||
fprintf(stderr, "error: failed to initialize whisper context\n");
|
||||
return 3;
|
||||
}
|
||||
|
||||
// initial prompt
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
|
||||
if (!params.prompt.empty()) {
|
||||
prompt_tokens.resize(1024);
|
||||
prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
|
||||
fprintf(stderr, "initial tokens: [ ");
|
||||
for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
|
||||
fprintf(stderr, "%d ", prompt_tokens[i]);
|
||||
}
|
||||
fprintf(stderr, "]\n");
|
||||
}
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
|
||||
@ -486,14 +536,14 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
}
|
||||
|
||||
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), NULL) == false) {
|
||||
if (drwav_init_memory(&wav, wav_data.data(), wav_data.size(), nullptr) == false) {
|
||||
fprintf(stderr, "error: failed to open WAV file from stdin\n");
|
||||
return 4;
|
||||
}
|
||||
|
||||
fprintf(stderr, "%s: read %zu bytes from stdin\n", __func__, wav_data.size());
|
||||
}
|
||||
else if (drwav_init_file(&wav, fname_inp.c_str(), NULL) == false) {
|
||||
else if (drwav_init_file(&wav, fname_inp.c_str(), nullptr) == false) {
|
||||
fprintf(stderr, "error: failed to open '%s' as WAV file\n", fname_inp.c_str());
|
||||
return 5;
|
||||
}
|
||||
@ -509,7 +559,7 @@ int main(int argc, char ** argv) {
|
||||
}
|
||||
|
||||
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
|
||||
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
|
||||
fprintf(stderr, "%s: WAV file '%s' must be %i kHz\n", argv[0], fname_inp.c_str(), WHISPER_SAMPLE_RATE/1000);
|
||||
return 8;
|
||||
}
|
||||
|
||||
@ -528,11 +578,11 @@ int main(int argc, char ** argv) {
|
||||
// convert to mono, float
|
||||
pcmf32.resize(n);
|
||||
if (wav.channels == 1) {
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32[i] = float(pcm16[i])/32768.0f;
|
||||
}
|
||||
} else {
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
|
||||
}
|
||||
}
|
||||
@ -543,7 +593,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
pcmf32s[0].resize(n);
|
||||
pcmf32s[1].resize(n);
|
||||
for (int i = 0; i < n; i++) {
|
||||
for (uint64_t i = 0; i < n; i++) {
|
||||
pcmf32s[0][i] = float(pcm16[2*i])/32768.0f;
|
||||
pcmf32s[1][i] = float(pcm16[2*i + 1])/32768.0f;
|
||||
}
|
||||
@ -577,13 +627,14 @@ int main(int argc, char ** argv) {
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
|
||||
// run the inference
|
||||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
||||
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = false;
|
||||
wparams.print_progress = params.print_progress;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.translate = params.translate;
|
||||
@ -595,10 +646,19 @@ int main(int argc, char ** argv) {
|
||||
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.entropy_thold = params.entropy_thold;
|
||||
wparams.logprob_thold = params.logprob_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
wparams.greedy.best_of = params.best_of;
|
||||
wparams.beam_search.beam_size = params.beam_size;
|
||||
wparams.temperature_inc = -1;
|
||||
|
||||
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
|
||||
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s };
|
||||
|
||||
// this callback is called on each new segment
|
||||
@ -613,7 +673,7 @@ int main(int argc, char ** argv) {
|
||||
{
|
||||
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||
|
||||
wparams.encoder_begin_callback = [](struct whisper_context * ctx, void * user_data) {
|
||||
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
@ -653,6 +713,13 @@ int main(int argc, char ** argv) {
|
||||
const auto fname_wts = fname_inp + ".wts";
|
||||
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
|
||||
}
|
||||
|
||||
// output to CSV file
|
||||
if (params.output_csv) {
|
||||
const auto fname_csv = fname_inp + ".csv";
|
||||
output_csv(ctx, fname_csv.c_str());
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8,6 +8,8 @@ add_executable(${TARGET}
|
||||
emscripten.cpp
|
||||
)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE
|
||||
whisper
|
||||
)
|
||||
|
@ -49,6 +49,9 @@ void stream_main(size_t index) {
|
||||
wparams.max_tokens = 32;
|
||||
wparams.audio_ctx = 768; // partial encoder context for better performance
|
||||
|
||||
// disable temperature fallback
|
||||
wparams.temperature_inc = -1.0f;
|
||||
|
||||
wparams.language = "en";
|
||||
|
||||
printf("stream: using %d threads\n", wparams.n_threads);
|
||||
@ -129,7 +132,7 @@ EMSCRIPTEN_BINDINGS(stream) {
|
||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
||||
if (g_contexts[i] == nullptr) {
|
||||
g_contexts[i] = whisper_init(path_model.c_str());
|
||||
g_contexts[i] = whisper_init_from_file(path_model.c_str());
|
||||
if (g_contexts[i] != nullptr) {
|
||||
g_running = true;
|
||||
if (g_worker.joinable()) {
|
||||
|
@ -2,6 +2,9 @@ if (WHISPER_SUPPORT_SDL2)
|
||||
# stream
|
||||
set(TARGET stream)
|
||||
add_executable(${TARGET} stream.cpp)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
|
||||
target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif ()
|
||||
|
@ -10,6 +10,23 @@ More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/i
|
||||
|
||||
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
|
||||
|
||||
## Sliding window mode with VAD
|
||||
|
||||
Setting the `--step` argument to `0` enables the sliding window mode:
|
||||
|
||||
```java
|
||||
./stream -m ./models/ggml-small.en.bin -t 6 --step 0 --length 30000 -vth 0.6
|
||||
```
|
||||
|
||||
In this mode, the tool will transcribe only after some speech activity is detected. A very
|
||||
basic VAD detector is used, but in theory a more sophisticated approach can be added. The
|
||||
`-vth` argument determines the VAD threshold - higher values will make it detect silence more often.
|
||||
It's best to tune it to the specific use case, but a value around `0.6` should be OK in general.
|
||||
When silence is detected, it will transcribe the last `--length` milliseconds of audio and output
|
||||
a transcription block that is suitable for parsing.
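The "very basic VAD detector" mentioned above is an energy comparison between the tail of the audio window and the window as a whole; the actual `vad_simple` implementation appears in the stream.cpp changes further down. A condensed sketch of the idea (editorial aside; the function and parameter names here are illustrative):

```cpp
// Condensed sketch of the energy-based VAD idea; see vad_simple in stream.cpp below.
#include <cmath>
#include <vector>

bool tail_is_silent(const std::vector<float> & pcm, int n_last, float vad_thold) {
    const int n = (int) pcm.size();
    if (n_last <= 0 || n_last >= n) {
        return false;
    }

    float e_all = 0.0f, e_last = 0.0f;
    for (int i = 0; i < n; ++i) {
        e_all += std::fabs(pcm[i]);
        if (i >= n - n_last) {
            e_last += std::fabs(pcm[i]);
        }
    }
    e_all  /= n;
    e_last /= n_last;

    // the tail is considered silent when its average energy drops well below the window average
    return e_last <= vad_thold*e_all;
}
```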

## Building

The `stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:

```bash
@ -1,18 +1,21 @@
|
||||
// Real-time speech recognition of input from a microphone
|
||||
//
|
||||
// A very quick-n-dirty implementation serving mainly as a proof of concept.
|
||||
//
|
||||
|
||||
#include "whisper.h"
|
||||
|
||||
#include <SDL.h>
|
||||
#include <SDL_audio.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <mutex>
|
||||
|
||||
// 500 -> 00:05.000
|
||||
// 6000 -> 01:00.000
|
||||
@ -33,19 +36,23 @@ struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t step_ms = 3000;
|
||||
int32_t length_ms = 10000;
|
||||
int32_t keep_ms = 200;
|
||||
int32_t capture_id = -1;
|
||||
int32_t max_tokens = 32;
|
||||
int32_t audio_ctx = 0;
|
||||
|
||||
float vad_thold = 0.6f;
|
||||
float freq_thold = 100.0f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool no_context = true;
|
||||
bool print_special = false;
|
||||
bool no_timestamps = true;
|
||||
bool no_context = true;
|
||||
bool no_timestamps = false;
|
||||
|
||||
std::string language = "en";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
std::string fname_out = "";
|
||||
std::string fname_out;
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
@ -61,13 +68,16 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
|
||||
else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
||||
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
||||
@ -81,7 +91,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
||||
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
@ -90,13 +100,16 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
|
||||
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
|
||||
fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.keep_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
||||
@ -107,19 +120,58 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
|
||||
// SDL Audio capture
|
||||
//
|
||||
|
||||
SDL_AudioDeviceID g_dev_id_in = 0;
|
||||
class audio_async {
|
||||
public:
|
||||
audio_async(int len_ms);
|
||||
~audio_async();
|
||||
|
||||
bool audio_sdl_init(const int capture_id) {
|
||||
if (g_dev_id_in) {
|
||||
fprintf(stderr, "%s: already initialized\n", __func__);
|
||||
return false;
|
||||
bool init(int capture_id, int sample_rate);
|
||||
|
||||
// start capturing audio via the provided SDL callback
|
||||
// keep last len_ms seconds of audio in a circular buffer
|
||||
bool resume();
|
||||
bool pause();
|
||||
bool clear();
|
||||
|
||||
// callback to be called by SDL
|
||||
void callback(uint8_t * stream, int len);
|
||||
|
||||
// get audio data from the circular buffer
|
||||
void get(int ms, std::vector<float> & audio);
|
||||
|
||||
private:
|
||||
SDL_AudioDeviceID m_dev_id_in = 0;
|
||||
|
||||
int m_len_ms = 0;
|
||||
int m_sample_rate = 0;
|
||||
|
||||
std::atomic_bool m_running;
|
||||
std::mutex m_mutex;
|
||||
|
||||
std::vector<float> m_audio;
|
||||
std::vector<float> m_audio_new;
|
||||
size_t m_audio_pos = 0;
|
||||
size_t m_audio_len = 0;
|
||||
};
|
||||
|
||||
audio_async::audio_async(int len_ms) {
|
||||
m_len_ms = len_ms;
|
||||
|
||||
m_running = false;
|
||||
}
|
||||
|
||||
audio_async::~audio_async() {
|
||||
if (m_dev_id_in) {
|
||||
SDL_CloseAudioDevice(m_dev_id_in);
|
||||
}
|
||||
}
|
||||
|
||||
bool audio_async::init(int capture_id, int sample_rate) {
|
||||
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
|
||||
|
||||
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
|
||||
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
|
||||
return (1);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
|
||||
@ -138,34 +190,232 @@ bool audio_sdl_init(const int capture_id) {
|
||||
SDL_zero(capture_spec_requested);
|
||||
SDL_zero(capture_spec_obtained);
|
||||
|
||||
capture_spec_requested.freq = WHISPER_SAMPLE_RATE;
|
||||
capture_spec_requested.freq = sample_rate;
|
||||
capture_spec_requested.format = AUDIO_F32;
|
||||
capture_spec_requested.channels = 1;
|
||||
capture_spec_requested.samples = 1024;
|
||||
capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
|
||||
audio_async * audio = (audio_async *) userdata;
|
||||
audio->callback(stream, len);
|
||||
};
|
||||
capture_spec_requested.userdata = this;
|
||||
|
||||
if (capture_id >= 0) {
|
||||
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
|
||||
g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
} else {
|
||||
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
|
||||
g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
|
||||
}
|
||||
if (!g_dev_id_in) {
|
||||
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
|
||||
g_dev_id_in = 0;
|
||||
m_dev_id_in = 0;
|
||||
|
||||
return false;
|
||||
} else {
|
||||
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
|
||||
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
|
||||
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
|
||||
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
|
||||
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
|
||||
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
|
||||
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
|
||||
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
|
||||
capture_spec_requested.format);
|
||||
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
|
||||
capture_spec_requested.channels);
|
||||
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
|
||||
}
|
||||
|
||||
m_sample_rate = capture_spec_obtained.freq;
|
||||
|
||||
m_audio.resize((m_sample_rate*m_len_ms)/1000);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::resume() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to resume!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (m_running) {
|
||||
fprintf(stderr, "%s: already running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_PauseAudioDevice(m_dev_id_in, 0);
|
||||
|
||||
m_running = true;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::pause() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to pause!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: already paused!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
SDL_PauseAudioDevice(m_dev_id_in, 1);
|
||||
|
||||
m_running = false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool audio_async::clear() {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to clear!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return false;
|
||||
}
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
m_audio_pos = 0;
|
||||
m_audio_len = 0;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// callback to be called by SDL
|
||||
void audio_async::callback(uint8_t * stream, int len) {
|
||||
if (!m_running) {
|
||||
return;
|
||||
}
|
||||
|
||||
const size_t n_samples = len / sizeof(float);
|
||||
|
||||
m_audio_new.resize(n_samples);
|
||||
memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
|
||||
|
||||
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (m_audio_pos + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - m_audio_pos;
|
||||
|
||||
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
|
||||
memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = m_audio.size();
|
||||
} else {
|
||||
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
|
||||
|
||||
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
|
||||
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void audio_async::get(int ms, std::vector<float> & result) {
|
||||
if (!m_dev_id_in) {
|
||||
fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!m_running) {
|
||||
fprintf(stderr, "%s: not running!\n", __func__);
|
||||
return;
|
||||
}
|
||||
|
||||
result.clear();
|
||||
|
||||
{
|
||||
std::lock_guard<std::mutex> lock(m_mutex);
|
||||
|
||||
if (ms <= 0) {
|
||||
ms = m_len_ms;
|
||||
}
|
||||
|
||||
size_t n_samples = (m_sample_rate * ms) / 1000;
|
||||
if (n_samples > m_audio_len) {
|
||||
n_samples = m_audio_len;
|
||||
}
|
||||
|
||||
result.resize(n_samples);
|
||||
|
||||
int s0 = m_audio_pos - n_samples;
|
||||
if (s0 < 0) {
|
||||
s0 += m_audio.size();
|
||||
}
|
||||
|
||||
if (s0 + n_samples > m_audio.size()) {
|
||||
const size_t n0 = m_audio.size() - s0;
|
||||
|
||||
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
|
||||
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
|
||||
} else {
|
||||
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
|
||||
}
|
||||
}
|
||||
}
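To make the intended call sequence of `audio_async` explicit, here is a minimal usage sketch. Editorial aside: it assumes the class defined above plus `WHISPER_SAMPLE_RATE` from whisper.h and the file's existing includes, and it omits the SDL event / Ctrl+C handling shown in `main()`.

```cpp
// Minimal usage sketch of audio_async (error handling and the SDL event loop omitted).
int example_capture() {
    audio_async audio(10000); // keep the last 10 seconds in the ring buffer

    if (!audio.init(-1, WHISPER_SAMPLE_RATE)) { // -1 selects the default capture device
        return 1;
    }
    audio.resume();

    std::vector<float> pcmf32;
    audio.get(2000, pcmf32); // most recent 2 seconds of mono float samples

    // ... run vad_simple() / whisper_full() on pcmf32 here ...

    audio.clear(); // drop already-processed audio
    audio.pause();

    return 0;
}
```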
///////////////////////////

void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
    const float rc = 1.0f / (2.0f * M_PI * cutoff);
    const float dt = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    float y = data[0];

    for (size_t i = 1; i < data.size(); i++) {
        y = alpha * (y + data[i] - data[i - 1]);
        data[i] = y;
    }
}
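For reference, the loop above is based on the textbook first-order RC high-pass recurrence. One subtlety worth noting: because the buffer is filtered in place, `data[i - 1]` has already been replaced by the previous output by the time it is read, so as written the loop effectively reduces to scaling each sample by `alpha` rather than applying the full recurrence. The variant below is an editorial sketch (not part of the commit) that keeps the previous raw sample in a temporary:

```cpp
// Textbook single-pole high-pass:  y[i] = alpha * (y[i-1] + x[i] - x[i-1]),
// with alpha = dt / (rc + dt), rc = 1 / (2*pi*cutoff), dt = 1 / sample_rate.
// Editorial sketch of an in-place variant that preserves the previous *raw* sample.
#include <cstddef>
#include <vector>

void high_pass_filter_ref(std::vector<float> & data, float cutoff, float sample_rate) {
    const float pi    = 3.14159265358979323846f;
    const float rc    = 1.0f / (2.0f * pi * cutoff);
    const float dt    = 1.0f / sample_rate;
    const float alpha = dt / (rc + dt);

    float y      = data[0]; // previous output
    float x_prev = data[0]; // previous raw input

    for (size_t i = 1; i < data.size(); i++) {
        const float x = data[i];
        y = alpha * (y + x - x_prev);
        x_prev  = x;
        data[i] = y;
    }
}
```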
bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
    const int n_samples      = pcmf32.size();
    const int n_samples_last = (sample_rate * last_ms) / 1000;

    if (n_samples_last >= n_samples) {
        // not enough samples - assume no speech
        return false;
    }

    if (freq_thold > 0.0f) {
        high_pass_filter(pcmf32, freq_thold, sample_rate);
    }

    float energy_all  = 0.0f;
    float energy_last = 0.0f;

    for (int i = 0; i < n_samples; i++) {
        energy_all += fabsf(pcmf32[i]);

        if (i >= n_samples - n_samples_last) {
            energy_last += fabsf(pcmf32[i]);
        }
    }

    energy_all  /= n_samples;
    energy_last /= n_samples_last;

    if (verbose) {
        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
    }

    if (energy_last > vad_thold*energy_all) {
        return false;
    }

    return true;
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
|
||||
@ -173,33 +423,46 @@ int main(int argc, char ** argv) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
params.keep_ms = std::min(params.keep_ms, params.step_ms); // cannot be more than step_ms
|
||||
|
||||
const int n_samples_step = (params.step_ms *1e-3)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_len = (params.length_ms*1e-3)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_keep = (params.keep_ms *1e-3)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_30s = (30000 *1e-3)*WHISPER_SAMPLE_RATE;
|
||||
|
||||
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
|
||||
|
||||
const int n_new_line = !use_vad ? params.length_ms / params.step_ms - 1 : 1; // number of steps to print new line
|
||||
|
||||
params.no_timestamps = !use_vad;
|
||||
params.no_context |= use_vad;
|
||||
params.max_tokens = 0;
|
||||
|
||||
// init audio
|
||||
|
||||
if (!audio_sdl_init(params.capture_id)) {
|
||||
fprintf(stderr, "%s: audio_sdl_init() failed!\n", __func__);
|
||||
audio_async audio(params.length_ms);
|
||||
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
|
||||
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
audio.resume();
|
||||
|
||||
// whisper init
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// whisper init
|
||||
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
||||
|
||||
struct whisper_context * ctx = whisper_init(params.model.c_str());
|
||||
|
||||
const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_len = (params.length_ms/1000.0)*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_30s = 30*WHISPER_SAMPLE_RATE;
|
||||
const int n_samples_keep = 0.2*WHISPER_SAMPLE_RATE;
|
||||
|
||||
std::vector<float> pcmf32(n_samples_30s, 0.0f);
|
||||
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
|
||||
std::vector<float> pcmf32_old;
|
||||
std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
|
||||
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
const int n_new_line = params.length_ms / params.step_ms - 1;
|
||||
|
||||
// print some info about the processing
|
||||
{
|
||||
@@ -211,23 +474,28 @@ int main(int argc, char ** argv) {
                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
            }
        }
        fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
        fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
                __func__,
                n_samples,
                float(n_samples)/WHISPER_SAMPLE_RATE,
                float(n_samples_len)/WHISPER_SAMPLE_RATE,
                n_samples_step,
                float(n_samples_step)/WHISPER_SAMPLE_RATE,
                float(n_samples_len )/WHISPER_SAMPLE_RATE,
                float(n_samples_keep)/WHISPER_SAMPLE_RATE,
                params.n_threads,
                params.language.c_str(),
                params.translate ? "translate" : "transcribe",
                params.no_timestamps ? 0 : 1);

        fprintf(stderr, "%s: n_new_line = %d\n", __func__, n_new_line);
        if (!use_vad) {
            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
        } else {
            fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
        }

        fprintf(stderr, "\n");
    }

    SDL_PauseAudioDevice(g_dev_id_in, 0);

    int n_iter = 0;

    bool is_running = true;

    std::ofstream fout;
@@ -242,6 +510,9 @@ int main(int argc, char ** argv) {
    printf("[Start speaking]");
    fflush(stdout);

    auto t_last  = std::chrono::high_resolution_clock::now();
    const auto t_start = t_last;

    // main audio loop
    while (is_running) {
        // handle Ctrl + C
@@ -268,35 +539,64 @@
        }

        // process new audio
        if (n_iter > 0 && SDL_GetQueuedAudioSize(g_dev_id_in) > 2*n_samples*sizeof(float)) {
            fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
            SDL_ClearQueuedAudio(g_dev_id_in);

        if (!use_vad) {
            while (true) {
                audio.get(params.step_ms, pcmf32_new);

                if ((int) pcmf32_new.size() > 2*n_samples_step) {
                    fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
                    audio.clear();
                    continue;
                }

                if ((int) pcmf32_new.size() >= n_samples_step) {
                    audio.clear();
                    break;
                }

                SDL_Delay(1);
            }

            const int n_samples_new = pcmf32_new.size();

            // take up to params.length_ms audio from previous iteration
            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));

            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());

            pcmf32.resize(n_samples_new + n_samples_take);

            for (int i = 0; i < n_samples_take; i++) {
                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
            }

            memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));

            pcmf32_old = pcmf32;
        } else {
            const auto t_now  = std::chrono::high_resolution_clock::now();
            const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();

            if (t_diff < 2000) {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));

                continue;
            }

            audio.get(2000, pcmf32_new);

            if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
                audio.get(params.length_ms, pcmf32);
            } else {
                std::this_thread::sleep_for(std::chrono::milliseconds(100));

                continue;
            }

            t_last = t_now;
        }

            while (SDL_GetQueuedAudioSize(g_dev_id_in) < n_samples*sizeof(float)) {
                SDL_Delay(1);
            }

            const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float);

            // take one second from previous iteration
            //const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));

            // take up to params.length_ms audio from previous iteration
            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));

            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());

            pcmf32.resize(n_samples_new + n_samples_take);

            for (int i = 0; i < n_samples_take; i++) {
                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
            }

            SDL_DequeueAudio(g_dev_id_in, pcmf32.data() + n_samples_take, n_samples_new*sizeof(float));

            pcmf32_old = pcmf32;

        // run the inference
        {
            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
@@ -307,7 +607,7 @@ int main(int argc, char ** argv) {
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
            wparams.no_context       = true;
            wparams.single_segment   = true;
            wparams.single_segment   = !use_vad;
            wparams.max_tokens       = params.max_tokens;
            wparams.language         = params.language.c_str();
            wparams.n_threads        = params.n_threads;
@@ -315,6 +615,9 @@ int main(int argc, char ** argv) {
            wparams.audio_ctx        = params.audio_ctx;
            wparams.speed_up         = params.speed_up;

            // disable temperature fallback
            wparams.temperature_inc  = -1.0f;

            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
            wparams.prompt_n_tokens  = params.no_context ? 0 : prompt_tokens.size();

@@ -325,12 +628,21 @@ int main(int argc, char ** argv) {

            // print result;
            {
                printf("\33[2K\r");
                if (!use_vad) {
                    printf("\33[2K\r");

                    // print long empty line to clear the previous line
                    printf("%s", std::string(100, ' ').c_str());
                // print long empty line to clear the previous line
                printf("%s", std::string(100, ' ').c_str());

                    printf("\33[2K\r");
                printf("\33[2K\r");
                } else {
                    const int64_t t1 = (t_last - t_start).count()/1000000;
                    const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);

                    printf("\n");
                    printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
                    printf("\n");
                }

                const int n_segments = whisper_full_n_segments(ctx);
                for (int i = 0; i < n_segments; ++i) {
@@ -358,11 +670,16 @@ int main(int argc, char ** argv) {
                    if (params.fname_out.length() > 0) {
                        fout << std::endl;
                    }

                if (use_vad){
                    printf("\n");
                    printf("### Transcription %d END\n", n_iter);
                }
            }

            ++n_iter;

            if ((n_iter % n_new_line) == 0) {
            if (!use_vad && (n_iter % n_new_line) == 0) {
                printf("\n");

                // keep part of the audio for next iteration to try to mitigate word boundary issues
@@ -384,9 +701,7 @@ int main(int argc, char ** argv) {
        }
    }

    if (g_dev_id_in >= 0) {
        SDL_CloseAudioDevice(g_dev_id_in);
    }
    audio.pause();

    whisper_print_timings(ctx);
    whisper_free(ctx);
@@ -9,6 +9,8 @@ add_executable(${TARGET}
    gpt-2.cpp
    )

include(DefaultTargetOptions)

target_link_libraries(${TARGET} PRIVATE
    whisper
    )
@@ -31,8 +33,8 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
    -s INITIAL_MEMORY=1600MB \
    -s TOTAL_MEMORY=1600MB \
    -s INITIAL_MEMORY=1800MB \
    -s TOTAL_MEMORY=1800MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
@@ -36,7 +36,7 @@ In order to run this demo efficiently, you need to have the following:
- Latest Chrome or Firefox browser (Safari is not supported)
- Run this on a desktop or laptop with modern CPU (a mobile phone will likely not be good enough)
- Speak phrases that are no longer than 10 seconds - this is the audio context of the AI
- The web-page uses about 1.6GB of RAM
- The web-page uses about 1.8GB of RAM

Notice that this demo is using the smallest GPT-2 model, so the generated text responses are not always very good.
Also, the prompting strategy can likely be improved to achieve better results.
@@ -271,7 +271,7 @@ EMSCRIPTEN_BINDINGS(talk) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        for (size_t i = 0; i < g_contexts.size(); ++i) {
            if (g_contexts[i] == nullptr) {
                g_contexts[i] = whisper_init(path_model.c_str());
                g_contexts[i] = whisper_init_from_file(path_model.c_str());
                if (g_contexts[i] != nullptr) {
                    g_running = true;
                    if (g_worker.joinable()) {
@@ -8,6 +8,9 @@ if (WHISPER_SUPPORT_SDL2)
    # TODO: this is temporary
    # need to export ggml symbols for MSVC, but too lazy ..
    add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)

    include(DefaultTargetOptions)

    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endif ()
@@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:

```
wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://ggml.ggerganov.com/ggml-model-gpt-2-117M.bin
wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
```

## TTS
@ -40,7 +40,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
|
||||
// find the longest tokens that form the words:
|
||||
std::vector<gpt_vocab::id> tokens;
|
||||
for (const auto & word : words) {
|
||||
if (word.size() == 0) continue;
|
||||
if (word.empty()) continue;
|
||||
|
||||
int i = 0;
|
||||
int n = word.size();
|
||||
@ -78,7 +78,7 @@ gpt_vocab::id gpt_sample_top_k_top_p(
|
||||
const float * logits,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
double /*temp*/,
|
||||
std::mt19937 & rng) {
|
||||
int n_logits = vocab.id_to_token.size();
|
||||
|
||||
@ -86,7 +86,7 @@ gpt_vocab::id gpt_sample_top_k_top_p(
|
||||
logits_id.reserve(n_logits);
|
||||
|
||||
for (int i = 0; i < n_logits; i++) {
|
||||
logits_id.push_back(std::make_pair(logits[i], i));
|
||||
logits_id.emplace_back(logits[i], i);
|
||||
}
|
||||
|
||||
// find the top K tokens
|
||||
@ -139,7 +139,7 @@ gpt_vocab::id gpt_sample_top_k_top_p(
|
||||
}
|
||||
|
||||
//printf("\n");
|
||||
//for (int i = 0; i < (int)logits_id.size(); i++) {
|
||||
//for (int i = 0; i < (int) logits_id.size(); i++) {
|
||||
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
|
||||
//}
|
||||
//exit(0);
|
||||
@ -268,7 +268,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
||||
fin.read((char *) &len, sizeof(len));
|
||||
|
||||
word.resize(len);
|
||||
fin.read((char *) word.data(), len);
|
||||
fin.read((char *) &word[0], len);
|
||||
|
||||
vocab.token_to_id[word] = i;
|
||||
vocab.id_to_token[i] = word;
|
||||
@ -327,7 +327,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
||||
{
|
||||
struct ggml_init_params params;
|
||||
params.mem_size = ctx_size;
|
||||
params.mem_buffer = NULL;
|
||||
params.mem_buffer = nullptr;
|
||||
|
||||
model.ctx = ggml_init(params);
|
||||
if (!model.ctx) {
|
||||
@ -448,7 +448,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
||||
std::string name(length, 0);
|
||||
fin.read(&name[0], length);
|
||||
|
||||
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||
if (model.tensors.find(name) == model.tensors.end()) {
|
||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||
return false;
|
||||
}
|
||||
@ -825,22 +825,23 @@ Me too.
|
||||
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
||||
|
||||
// sampling parameters
|
||||
int32_t top_k = 20;
|
||||
float top_p = 0.98f;
|
||||
int32_t top_k = 5;
|
||||
float top_p = 0.9f;
|
||||
float temp = 1.0f;
|
||||
};
|
||||
|
||||
struct gpt2_context * gpt2_init(const char * path_model) {
|
||||
gpt2_context * ctx = new gpt2_context;
|
||||
|
||||
ctx->rng = std::mt19937(time(NULL));
|
||||
ctx->rng = std::mt19937(time(nullptr));
|
||||
|
||||
// load the model
|
||||
{
|
||||
const int64_t t_start_us = ggml_time_us();
|
||||
|
||||
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
|
||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
|
||||
delete ctx;
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
@ -884,9 +885,9 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
|
||||
|
||||
std::string result;
|
||||
|
||||
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
|
||||
for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
|
||||
// predict
|
||||
if (embd.size() > 0) {
|
||||
if (!embd.empty()) {
|
||||
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
|
||||
printf("gpt-2: failed to generate text\n");
|
||||
return "";
|
||||
@ -913,10 +914,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
|
||||
result += ctx->vocab.id_to_token[embd[0]];
|
||||
|
||||
// end of text token
|
||||
if (embd.back() == 50256 ||
|
||||
ctx->vocab.id_to_token[embd.back()] == "." ||
|
||||
ctx->vocab.id_to_token[embd.back()] == "!" ||
|
||||
ctx->vocab.id_to_token[embd.back()] == "?") {
|
||||
if (embd.back() == 50256) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
@ -39,7 +39,7 @@ struct whisper_params {
|
||||
std::string model_wsp = "models/ggml-base.en.bin";
|
||||
std::string model_gpt = "models/ggml-gpt-2-117M.bin";
|
||||
std::string speak = "./examples/talk/speak.sh";
|
||||
std::string fname_out = "";
|
||||
std::string fname_out;
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
@ -79,7 +79,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
|
||||
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
@ -397,7 +397,7 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
|
||||
float energy_all = 0.0f;
|
||||
float energy_last = 0.0f;
|
||||
|
||||
for (size_t i = 0; i < n_samples; i++) {
|
||||
for (int i = 0; i < n_samples; i++) {
|
||||
energy_all += fabsf(pcmf32[i]);
|
||||
|
||||
if (i >= n_samples - n_samples_last) {
|
||||
@ -473,56 +473,15 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
|
||||
return result;
|
||||
}
|
||||
|
||||
// compute similarity between two strings using Levenshtein distance
|
||||
float similarity(const std::string & s0, const std::string & s1) {
|
||||
const size_t len0 = s0.size() + 1;
|
||||
const size_t len1 = s1.size() + 1;
|
||||
const std::string k_prompt =
|
||||
R"(This is a dialogue between {0} (A) and a person (B). The dialogue so far is:
|
||||
|
||||
std::vector<int> col(len1, 0);
|
||||
std::vector<int> prevCol(len1, 0);
|
||||
B: Hello {0}, how are you?
|
||||
A: I'm fine, thank you.
|
||||
{1}
|
||||
Here is how {0} (A) continues the dialogue:
|
||||
|
||||
for (size_t i = 0; i < len1; i++) {
|
||||
prevCol[i] = i;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < len0; i++) {
|
||||
col[0] = i;
|
||||
for (size_t j = 1; j < len1; j++) {
|
||||
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
|
||||
}
|
||||
col.swap(prevCol);
|
||||
}
|
||||
|
||||
const float dist = prevCol[len1 - 1];
|
||||
|
||||
return 1.0f - (dist / std::max(s0.size(), s1.size()));
|
||||
}
|
||||
|
||||
// generated with ChatGPT
|
||||
std::map<std::string, std::string> k_prompts = {
|
||||
{ "Santa",
|
||||
R"(Kid: Hi Santa! Are you real?
|
||||
Santa: Of course I am, my dear! Ho ho ho!
|
||||
Kid: Can you please bring me a new toy for Christmas?
|
||||
Santa: I'll see what I can do, but you have to make sure to be a good boy or girl and listen to your parents.
|
||||
Kid: I will, Santa! Thank you!
|
||||
Santa: You're welcome, little one. Merry Christmas! Ho ho ho!
|
||||
Kid: Can you tell me how you deliver all the presents to all the kids in the world in one night?
|
||||
Santa: It's a secret, but I have a lot of help from my elves and my magical sleigh. And I have a special route that I follow to make sure I visit every child.
|
||||
Kid: Wow, that's amazing! Can I please have a ride in your sleigh sometime?
|
||||
Santa: I'm sorry, but only good boys and girls get to ride in my sleigh.
|
||||
)" },
|
||||
{ "Kid",
|
||||
R"(Kid: Hi Santa! Are you real?
|
||||
Santa: Of course I am, my dear! Ho ho ho!
|
||||
Kid: Can you please bring me a new toy for Christmas?
|
||||
Santa: I'll see what I can do, but you have to make sure to be a good boy or girl and listen to your parents.
|
||||
Kid: I will, Santa! Thank you!
|
||||
Kid: Can you tell me how you deliver all the presents to all the kids in the world in one night?
|
||||
Santa: It's a secret, but I have a lot of help from my elves and my magical sleigh. And I have a special route that I follow to make sure I visit every child.
|
||||
Kid: Wow, that's amazing! Can I please have a ride in your sleigh sometime?
|
||||
)" },
|
||||
};
|
||||
A:)";
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
@ -539,7 +498,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
// whisper init
|
||||
|
||||
struct whisper_context * ctx_wsp = whisper_init(params.model_wsp.c_str());
|
||||
struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
|
||||
|
||||
// gpt init
|
||||
|
||||
@ -579,27 +538,20 @@ int main(int argc, char ** argv) {
|
||||
int n_iter = 0;
|
||||
|
||||
bool is_running = true;
|
||||
bool force_speak = params.person == "Kid";
|
||||
bool force_speak = false;
|
||||
|
||||
float prob0 = 0.0f;
|
||||
float prob = 0.0f;
|
||||
|
||||
std::vector<float> pcmf32_cur;
|
||||
std::vector<float> pcmf32_prompt;
|
||||
|
||||
if (k_prompts.find(params.person) == k_prompts.end()) {
|
||||
fprintf(stderr, "%s: unknown person '%s'\n", __func__, params.person.c_str());
|
||||
return 1;
|
||||
}
|
||||
gpt2_set_prompt(ctx_gpt, "");
|
||||
|
||||
gpt2_set_prompt(ctx_gpt, k_prompts.at(params.person).c_str());
|
||||
const int voice_id = rand()%6;
|
||||
|
||||
const std::string person_other = params.person == "Santa" ? "Kid" : "Santa";
|
||||
const int voice_id = params.person == "Santa" ? 5 : 2;
|
||||
|
||||
fprintf(stderr, "gpt-2: prompt_base:\n");
|
||||
fprintf(stderr, "gpt-2: prompt:\n");
|
||||
fprintf(stderr, "========================\n\n");
|
||||
fprintf(stderr, "%s\n", gpt2_get_prompt(ctx_gpt));
|
||||
fprintf(stderr, "%s\n", ::replace(k_prompt, "{0}", params.person).c_str());
|
||||
fprintf(stderr, "========================\n\n");
|
||||
|
||||
// main loop
|
||||
@ -636,13 +588,12 @@ int main(int argc, char ** argv) {
|
||||
|
||||
audio.get(params.voice_ms, pcmf32_cur);
|
||||
|
||||
std::string text_heard = "Hey little one, what do you want for Christmas?";
|
||||
std::string text_heard;
|
||||
|
||||
if (!force_speak) {
|
||||
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
|
||||
}
|
||||
|
||||
force_speak = false;
|
||||
|
||||
// remove text between brackets using regex
|
||||
{
|
||||
std::regex re("\\[.*?\\]");
|
||||
@ -659,7 +610,7 @@ int main(int argc, char ** argv) {
|
||||
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
||||
|
||||
// take first line
|
||||
text_heard = text_heard.substr(0, text_heard.find_first_of("\n"));
|
||||
text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
|
||||
|
||||
// remove leading and trailing whitespace
|
||||
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
|
||||
@ -667,13 +618,15 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(ctx_gpt, text_heard.c_str());
|
||||
|
||||
if (text_heard.empty() || tokens.empty()) {
|
||||
if (text_heard.empty() || tokens.empty() || force_speak) {
|
||||
fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
|
||||
audio.clear();
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
force_speak = false;
|
||||
|
||||
fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", text_heard.c_str(), "\033[0m", (int) t_ms);
|
||||
|
||||
std::string prompt_base = gpt2_get_prompt(ctx_gpt);
|
||||
@ -681,35 +634,44 @@ int main(int argc, char ** argv) {
|
||||
std::string text_to_speak;
|
||||
|
||||
{
|
||||
text_heard = person_other + ": " + text_heard;
|
||||
prompt_base += "B: " + text_heard + "\n";
|
||||
|
||||
text_to_speak = gpt2_gen_text(ctx_gpt, (prompt_base + text_heard + "\n").c_str(), params.max_tokens);
|
||||
std::string prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
|
||||
|
||||
text_to_speak = gpt2_gen_text(ctx_gpt, prompt.c_str(), params.max_tokens);
|
||||
text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
|
||||
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
|
||||
text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of('\n'));
|
||||
|
||||
// remove first 2 lines of base prompt
|
||||
if (n_iter > 4) {
|
||||
{
|
||||
const size_t pos = prompt_base.find_first_of("\n");
|
||||
const size_t pos = prompt_base.find_first_of('\n');
|
||||
if (pos != std::string::npos) {
|
||||
prompt_base = prompt_base.substr(pos + 1);
|
||||
}
|
||||
}
|
||||
{
|
||||
const size_t pos = prompt_base.find_first_of("\n");
|
||||
const size_t pos = prompt_base.find_first_of('\n');
|
||||
if (pos != std::string::npos) {
|
||||
prompt_base = prompt_base.substr(pos + 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
prompt_base += text_heard + "\n" + text_to_speak + "\n";
|
||||
prompt_base += "A:" + text_to_speak + "\n";
|
||||
|
||||
{
|
||||
prompt = ::replace(::replace(k_prompt, "{0}", params.person), "{1}", prompt_base);
|
||||
|
||||
printf("===============\n");
|
||||
printf("prompt:\n");
|
||||
printf("%s\n", prompt.c_str());
|
||||
printf("===============\n");
|
||||
}
|
||||
}
|
||||
|
||||
printf("%s\n", text_to_speak.c_str());
|
||||
|
||||
//printf("========================\n");
|
||||
//printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
|
||||
//printf("gpt-2: prompt_base:\n%s\n", prompt_base.c_str());
|
||||
//printf("========================\n");
|
||||
|
||||
gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
|
||||
|
15
examples/whisper.android/.gitignore
vendored
Normal file
@@ -0,0 +1,15 @@
*.iml
.gradle
/local.properties
/.idea/caches
/.idea/libraries
/.idea/modules.xml
/.idea/workspace.xml
/.idea/navEditor.xml
/.idea/assetWizardSettings.xml
.DS_Store
/build
/captures
.externalNativeBuild
.cxx
local.properties
3
examples/whisper.android/.idea/.gitignore
generated
vendored
Normal file
@@ -0,0 +1,3 @@
# Default ignored files
/shelf/
/workspace.xml
1
examples/whisper.android/.idea/.name
generated
Normal file
@@ -0,0 +1 @@
WhisperCppDemo
6
examples/whisper.android/.idea/compiler.xml
generated
Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="CompilerConfiguration">
    <bytecodeTargetLevel target="11" />
  </component>
</project>
19
examples/whisper.android/.idea/gradle.xml
generated
Normal file
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="GradleMigrationSettings" migrationVersion="1" />
  <component name="GradleSettings">
    <option name="linkedExternalProjectsSettings">
      <GradleProjectSettings>
        <option name="testRunner" value="GRADLE" />
        <option name="distributionType" value="DEFAULT_WRAPPED" />
        <option name="externalProjectPath" value="$PROJECT_DIR$" />
        <option name="modules">
          <set>
            <option value="$PROJECT_DIR$" />
            <option value="$PROJECT_DIR$/app" />
          </set>
        </option>
      </GradleProjectSettings>
    </option>
  </component>
</project>
10
examples/whisper.android/.idea/misc.xml
generated
Normal file
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="ExternalStorageConfigurationManager" enabled="true" />
  <component name="ProjectRootManager" version="2" languageLevel="JDK_11" default="true" project-jdk-name="Android Studio default JDK" project-jdk-type="JavaSDK">
    <output url="file://$PROJECT_DIR$/build/classes" />
  </component>
  <component name="ProjectType">
    <option name="id" value="Android" />
  </component>
</project>
6
examples/whisper.android/.idea/vcs.xml
generated
Normal file
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
  <component name="VcsDirectoryMappings">
    <mapping directory="$PROJECT_DIR$/../.." vcs="Git" />
  </component>
</project>
12
examples/whisper.android/README.md
Normal file
@@ -0,0 +1,12 @@
A sample Android app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions.

To use:

1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1]
2. Copy the model to the "app/src/main/assets/models" folder.
3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
4. Copy the sample to the "app/src/main/assets/samples" folder.
5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
[^1]: I recommend the tiny or base models for running on an Android device.

<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">
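For steps 1-4 of the README above, a minimal command-line sketch of fetching the files before copying them into the asset folders. The checkout location (`~/whisper.cpp`) and the choice of the `tiny.en` model are assumptions for illustration, not requirements of the app:

```
# download a small ggml model using the helper script shipped with whisper.cpp (assumed checkout path)
bash ~/whisper.cpp/models/download-ggml-model.sh tiny.en

# copy the model and a sample clip into the Android app's asset folders
mkdir -p app/src/main/assets/models app/src/main/assets/samples
cp ~/whisper.cpp/models/ggml-tiny.en.bin app/src/main/assets/models/
wget -O app/src/main/assets/samples/jfk.wav https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav
```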
1
examples/whisper.android/app/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
/build
72
examples/whisper.android/app/build.gradle
Normal file
@ -0,0 +1,72 @@
|
||||
plugins {
|
||||
id 'com.android.application'
|
||||
id 'org.jetbrains.kotlin.android'
|
||||
}
|
||||
|
||||
android {
|
||||
namespace 'com.whispercppdemo'
|
||||
compileSdk 33
|
||||
|
||||
defaultConfig {
|
||||
applicationId "com.whispercppdemo"
|
||||
minSdk 26
|
||||
targetSdk 32
|
||||
versionCode 1
|
||||
versionName "1.0"
|
||||
|
||||
testInstrumentationRunner "androidx.test.runner.AndroidJUnitRunner"
|
||||
vectorDrawables {
|
||||
useSupportLibrary true
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
release {
|
||||
signingConfig signingConfigs.debug
|
||||
minifyEnabled true
|
||||
proguardFiles getDefaultProguardFile('proguard-android-optimize.txt'), 'proguard-rules.pro'
|
||||
}
|
||||
}
|
||||
compileOptions {
|
||||
sourceCompatibility JavaVersion.VERSION_1_8
|
||||
targetCompatibility JavaVersion.VERSION_1_8
|
||||
}
|
||||
kotlinOptions {
|
||||
jvmTarget = '1.8'
|
||||
}
|
||||
buildFeatures {
|
||||
compose true
|
||||
}
|
||||
composeOptions {
|
||||
kotlinCompilerExtensionVersion '1.3.1'
|
||||
}
|
||||
ndkVersion "25.1.8937393"
|
||||
externalNativeBuild {
|
||||
ndkBuild {
|
||||
path 'src/main/jni/whisper/Android.mk'
|
||||
}
|
||||
}
|
||||
packagingOptions {
|
||||
resources {
|
||||
excludes += '/META-INF/{AL2.0,LGPL2.1}'
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
dependencies {
|
||||
implementation 'androidx.activity:activity-compose:1.6.1'
|
||||
implementation 'androidx.compose.material:material-icons-core:1.3.1'
|
||||
implementation 'androidx.compose.material3:material3:1.0.1'
|
||||
implementation "androidx.compose.ui:ui:1.3.2"
|
||||
implementation "androidx.compose.ui:ui-tooling-preview:1.3.2"
|
||||
implementation 'androidx.lifecycle:lifecycle-viewmodel-compose:2.5.1'
|
||||
implementation "com.google.accompanist:accompanist-permissions:0.28.0"
|
||||
implementation 'org.jetbrains.kotlinx:kotlinx-coroutines-core:1.6.4'
|
||||
|
||||
testImplementation 'junit:junit:4.13.2'
|
||||
androidTestImplementation 'androidx.test.ext:junit:1.1.4'
|
||||
androidTestImplementation 'androidx.test.espresso:espresso-core:3.5.0'
|
||||
androidTestImplementation "androidx.compose.ui:ui-test-junit4:1.3.2"
|
||||
debugImplementation "androidx.compose.ui:ui-tooling:1.3.2"
|
||||
debugImplementation "androidx.compose.ui:ui-test-manifest:1.3.2"
|
||||
}
|
21
examples/whisper.android/app/proguard-rules.pro
vendored
Normal file
@ -0,0 +1,21 @@
|
||||
# Add project specific ProGuard rules here.
|
||||
# You can control the set of applied configuration files using the
|
||||
# proguardFiles setting in build.gradle.
|
||||
#
|
||||
# For more details, see
|
||||
# http://developer.android.com/guide/developing/tools/proguard.html
|
||||
|
||||
# If your project uses WebView with JS, uncomment the following
|
||||
# and specify the fully qualified class name to the JavaScript interface
|
||||
# class:
|
||||
#-keepclassmembers class fqcn.of.javascript.interface.for.webview {
|
||||
# public *;
|
||||
#}
|
||||
|
||||
# Uncomment this to preserve the line number information for
|
||||
# debugging stack traces.
|
||||
#-keepattributes SourceFile,LineNumberTable
|
||||
|
||||
# If you keep the line number information, uncomment this to
|
||||
# hide the original source file name.
|
||||
#-renamesourcefileattribute SourceFile
|
@ -0,0 +1,24 @@
|
||||
package com.whispercppdemo
|
||||
|
||||
import androidx.test.platform.app.InstrumentationRegistry
|
||||
import androidx.test.ext.junit.runners.AndroidJUnit4
|
||||
|
||||
import org.junit.Test
|
||||
import org.junit.runner.RunWith
|
||||
|
||||
import org.junit.Assert.*
|
||||
|
||||
/**
|
||||
* Instrumented test, which will execute on an Android device.
|
||||
*
|
||||
* See [testing documentation](http://d.android.com/tools/testing).
|
||||
*/
|
||||
@RunWith(AndroidJUnit4::class)
|
||||
class ExampleInstrumentedTest {
|
||||
@Test
|
||||
fun useAppContext() {
|
||||
// Context of the app under test.
|
||||
val appContext = InstrumentationRegistry.getInstrumentation().targetContext
|
||||
assertEquals("com.whispercppdemo", appContext.packageName)
|
||||
}
|
||||
}
|
32
examples/whisper.android/app/src/main/AndroidManifest.xml
Normal file
@ -0,0 +1,32 @@
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<manifest xmlns:android="http://schemas.android.com/apk/res/android"
|
||||
xmlns:tools="http://schemas.android.com/tools">
|
||||
|
||||
<uses-permission android:name="android.permission.RECORD_AUDIO" />
|
||||
|
||||
<application
|
||||
android:allowBackup="true"
|
||||
android:dataExtractionRules="@xml/data_extraction_rules"
|
||||
android:fullBackupContent="@xml/backup_rules"
|
||||
android:icon="@mipmap/ic_launcher"
|
||||
android:label="@string/app_name"
|
||||
android:supportsRtl="true"
|
||||
android:theme="@style/Theme.WhisperCppDemo"
|
||||
tools:targetApi="31">
|
||||
<activity
|
||||
android:name=".MainActivity"
|
||||
android:exported="true"
|
||||
android:theme="@style/Theme.WhisperCppDemo">
|
||||
<intent-filter>
|
||||
<action android:name="android.intent.action.MAIN" />
|
||||
|
||||
<category android:name="android.intent.category.LAUNCHER" />
|
||||
</intent-filter>
|
||||
|
||||
<meta-data
|
||||
android:name="android.app.lib_name"
|
||||
android:value="" />
|
||||
</activity>
|
||||
</application>
|
||||
|
||||
</manifest>
|
@@ -0,0 +1,22 @@
package com.whispercppdemo

import android.os.Bundle
import androidx.activity.ComponentActivity
import androidx.activity.compose.setContent
import androidx.activity.viewModels
import com.whispercppdemo.ui.main.MainScreen
import com.whispercppdemo.ui.main.MainScreenViewModel
import com.whispercppdemo.ui.theme.WhisperCppDemoTheme

class MainActivity : ComponentActivity() {
    private val viewModel: MainScreenViewModel by viewModels { MainScreenViewModel.factory() }

    override fun onCreate(savedInstanceState: Bundle?) {
        super.onCreate(savedInstanceState)
        setContent {
            WhisperCppDemoTheme {
                MainScreen(viewModel)
            }
        }
    }
}
@ -0,0 +1,76 @@
|
||||
package com.whispercppdemo.media
|
||||
|
||||
import java.io.ByteArrayOutputStream
|
||||
import java.io.File
|
||||
import java.nio.ByteBuffer
|
||||
import java.nio.ByteOrder
|
||||
|
||||
fun decodeWaveFile(file: File): FloatArray {
|
||||
val baos = ByteArrayOutputStream()
|
||||
file.inputStream().use { it.copyTo(baos) }
|
||||
val buffer = ByteBuffer.wrap(baos.toByteArray())
|
||||
buffer.order(ByteOrder.LITTLE_ENDIAN)
|
||||
buffer.position(44)
|
||||
val shortBuffer = buffer.asShortBuffer()
|
||||
val shortArray = ShortArray(shortBuffer.limit())
|
||||
shortBuffer.get(shortArray)
|
||||
return FloatArray(shortArray.size) { index ->
|
||||
(shortArray[index] / 32767.0f).coerceIn(-1f..1f)
|
||||
}
|
||||
}
|
||||
|
||||
fun encodeWaveFile(file: File, data: ShortArray) {
|
||||
file.outputStream().use {
|
||||
it.write(headerBytes(data.size * 2))
|
||||
val buffer = ByteBuffer.allocate(data.size * 2)
|
||||
buffer.order(ByteOrder.LITTLE_ENDIAN)
|
||||
buffer.asShortBuffer().put(data)
|
||||
val bytes = ByteArray(buffer.limit())
|
||||
buffer.get(bytes)
|
||||
it.write(bytes)
|
||||
}
|
||||
}
|
||||
|
||||
private fun headerBytes(totalLength: Int): ByteArray {
|
||||
require(totalLength >= 44)
|
||||
ByteBuffer.allocate(44).apply {
|
||||
order(ByteOrder.LITTLE_ENDIAN)
|
||||
|
||||
put('R'.code.toByte())
|
||||
put('I'.code.toByte())
|
||||
put('F'.code.toByte())
|
||||
put('F'.code.toByte())
|
||||
|
||||
putInt(totalLength - 8)
|
||||
|
||||
put('W'.code.toByte())
|
||||
put('A'.code.toByte())
|
||||
put('V'.code.toByte())
|
||||
put('E'.code.toByte())
|
||||
|
||||
put('f'.code.toByte())
|
||||
put('m'.code.toByte())
|
||||
put('t'.code.toByte())
|
||||
put(' '.code.toByte())
|
||||
|
||||
putInt(16)
|
||||
putShort(1.toShort())
|
||||
putShort(1.toShort())
|
||||
putInt(16000)
|
||||
putInt(32000)
|
||||
putShort(2.toShort())
|
||||
putShort(16.toShort())
|
||||
|
||||
put('d'.code.toByte())
|
||||
put('a'.code.toByte())
|
||||
put('t'.code.toByte())
|
||||
put('a'.code.toByte())
|
||||
|
||||
putInt(totalLength - 44)
|
||||
position(0)
|
||||
}.also {
|
||||
val bytes = ByteArray(it.limit())
|
||||
it.get(bytes)
|
||||
return bytes
|
||||
}
|
||||
}
|
@ -0,0 +1,88 @@
|
||||
package com.whispercppdemo.recorder
|
||||
|
||||
import android.annotation.SuppressLint
|
||||
import android.media.AudioFormat
|
||||
import android.media.AudioRecord
|
||||
import android.media.MediaRecorder
|
||||
import com.whispercppdemo.media.encodeWaveFile
|
||||
import kotlinx.coroutines.CoroutineScope
|
||||
import kotlinx.coroutines.asCoroutineDispatcher
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
import java.util.concurrent.Executors
|
||||
import java.util.concurrent.atomic.AtomicBoolean
|
||||
|
||||
class Recorder {
|
||||
private val scope: CoroutineScope = CoroutineScope(
|
||||
Executors.newSingleThreadExecutor().asCoroutineDispatcher()
|
||||
)
|
||||
private var recorder: AudioRecordThread? = null
|
||||
|
||||
suspend fun startRecording(outputFile: File, onError: (Exception) -> Unit) = withContext(scope.coroutineContext) {
|
||||
recorder = AudioRecordThread(outputFile, onError)
|
||||
recorder?.start()
|
||||
}
|
||||
|
||||
suspend fun stopRecording() = withContext(scope.coroutineContext) {
|
||||
recorder?.stopRecording()
|
||||
@Suppress("BlockingMethodInNonBlockingContext")
|
||||
recorder?.join()
|
||||
recorder = null
|
||||
}
|
||||
}
|
||||
|
||||
private class AudioRecordThread(
|
||||
private val outputFile: File,
|
||||
private val onError: (Exception) -> Unit
|
||||
) :
|
||||
Thread("AudioRecorder") {
|
||||
private var quit = AtomicBoolean(false)
|
||||
|
||||
@SuppressLint("MissingPermission")
|
||||
override fun run() {
|
||||
try {
|
||||
val bufferSize = AudioRecord.getMinBufferSize(
|
||||
16000,
|
||||
AudioFormat.CHANNEL_IN_MONO,
|
||||
AudioFormat.ENCODING_PCM_16BIT
|
||||
) * 4
|
||||
val buffer = ShortArray(bufferSize / 2)
|
||||
|
||||
val audioRecord = AudioRecord(
|
||||
MediaRecorder.AudioSource.MIC,
|
||||
16000,
|
||||
AudioFormat.CHANNEL_IN_MONO,
|
||||
AudioFormat.ENCODING_PCM_16BIT,
|
||||
bufferSize
|
||||
)
|
||||
|
||||
try {
|
||||
audioRecord.startRecording()
|
||||
|
||||
val allData = mutableListOf<Short>()
|
||||
|
||||
while (!quit.get()) {
|
||||
val read = audioRecord.read(buffer, 0, buffer.size)
|
||||
if (read > 0) {
|
||||
for (i in 0 until read) {
|
||||
allData.add(buffer[i])
|
||||
}
|
||||
} else {
|
||||
throw java.lang.RuntimeException("audioRecord.read returned $read")
|
||||
}
|
||||
}
|
||||
|
||||
audioRecord.stop()
|
||||
encodeWaveFile(outputFile, allData.toShortArray())
|
||||
} finally {
|
||||
audioRecord.release()
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
onError(e)
|
||||
}
|
||||
}
|
||||
|
||||
fun stopRecording() {
|
||||
quit.set(true)
|
||||
}
|
||||
}
|
@ -0,0 +1,99 @@
|
||||
package com.whispercppdemo.ui.main
|
||||
|
||||
import androidx.compose.foundation.layout.*
|
||||
import androidx.compose.foundation.rememberScrollState
|
||||
import androidx.compose.foundation.verticalScroll
|
||||
import androidx.compose.material3.*
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.ui.Modifier
|
||||
import androidx.compose.ui.res.stringResource
|
||||
import androidx.compose.ui.unit.dp
|
||||
import com.google.accompanist.permissions.ExperimentalPermissionsApi
|
||||
import com.google.accompanist.permissions.isGranted
|
||||
import com.google.accompanist.permissions.rememberPermissionState
|
||||
import com.whispercppdemo.R
|
||||
|
||||
@Composable
|
||||
fun MainScreen(viewModel: MainScreenViewModel) {
|
||||
MainScreen(
|
||||
canTranscribe = viewModel.canTranscribe,
|
||||
isRecording = viewModel.isRecording,
|
||||
messageLog = viewModel.dataLog,
|
||||
onTranscribeSampleTapped = viewModel::transcribeSample,
|
||||
onRecordTapped = viewModel::toggleRecord
|
||||
)
|
||||
}
|
||||
|
||||
@OptIn(ExperimentalMaterial3Api::class)
|
||||
@Composable
|
||||
private fun MainScreen(
|
||||
canTranscribe: Boolean,
|
||||
isRecording: Boolean,
|
||||
messageLog: String,
|
||||
onTranscribeSampleTapped: () -> Unit,
|
||||
onRecordTapped: () -> Unit
|
||||
) {
|
||||
Scaffold(
|
||||
topBar = {
|
||||
TopAppBar(
|
||||
title = { Text(stringResource(R.string.app_name)) }
|
||||
)
|
||||
},
|
||||
) { innerPadding ->
|
||||
Column(
|
||||
modifier = Modifier
|
||||
.padding(innerPadding)
|
||||
.padding(16.dp)
|
||||
) {
|
||||
Row(horizontalArrangement = Arrangement.SpaceBetween) {
|
||||
TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
|
||||
RecordButton(
|
||||
enabled = canTranscribe,
|
||||
isRecording = isRecording,
|
||||
onClick = onRecordTapped
|
||||
)
|
||||
}
|
||||
MessageLog(messageLog)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Composable
|
||||
private fun MessageLog(log: String) {
|
||||
Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
|
||||
}
|
||||
|
||||
@Composable
|
||||
private fun TranscribeSampleButton(enabled: Boolean, onClick: () -> Unit) {
|
||||
Button(onClick = onClick, enabled = enabled) {
|
||||
Text("Transcribe sample")
|
||||
}
|
||||
}
|
||||
|
||||
@OptIn(ExperimentalPermissionsApi::class)
|
||||
@Composable
|
||||
private fun RecordButton(enabled: Boolean, isRecording: Boolean, onClick: () -> Unit) {
|
||||
val micPermissionState = rememberPermissionState(
|
||||
permission = android.Manifest.permission.RECORD_AUDIO,
|
||||
onPermissionResult = { granted ->
|
||||
if (granted) {
|
||||
onClick()
|
||||
}
|
||||
}
|
||||
)
|
||||
Button(onClick = {
|
||||
if (micPermissionState.status.isGranted) {
|
||||
onClick()
|
||||
} else {
|
||||
micPermissionState.launchPermissionRequest()
|
||||
}
|
||||
}, enabled = enabled) {
|
||||
Text(
|
||||
if (isRecording) {
|
||||
"Stop recording"
|
||||
} else {
|
||||
"Start recording"
|
||||
}
|
||||
)
|
||||
}
|
||||
}
|
@ -0,0 +1,198 @@
|
||||
package com.whispercppdemo.ui.main
|
||||
|
||||
import android.app.Application
|
||||
import android.content.Context
|
||||
import android.media.MediaPlayer
|
||||
import android.util.Log
|
||||
import androidx.compose.runtime.getValue
|
||||
import androidx.compose.runtime.mutableStateOf
|
||||
import androidx.compose.runtime.setValue
|
||||
import androidx.core.net.toUri
|
||||
import androidx.lifecycle.ViewModel
|
||||
import androidx.lifecycle.ViewModelProvider
|
||||
import androidx.lifecycle.viewModelScope
|
||||
import androidx.lifecycle.viewmodel.initializer
|
||||
import androidx.lifecycle.viewmodel.viewModelFactory
|
||||
import com.whispercppdemo.media.decodeWaveFile
|
||||
import com.whispercppdemo.recorder.Recorder
|
||||
import com.whispercppdemo.whisper.WhisperContext
|
||||
import kotlinx.coroutines.Dispatchers
|
||||
import kotlinx.coroutines.launch
|
||||
import kotlinx.coroutines.runBlocking
|
||||
import kotlinx.coroutines.withContext
|
||||
import java.io.File
|
||||
|
||||
private const val LOG_TAG = "MainScreenViewModel"
|
||||
|
||||
class MainScreenViewModel(private val application: Application) : ViewModel() {
|
||||
var canTranscribe by mutableStateOf(false)
|
||||
private set
|
||||
var dataLog by mutableStateOf("")
|
||||
private set
|
||||
var isRecording by mutableStateOf(false)
|
||||
private set
|
||||
|
||||
private val modelsPath = File(application.filesDir, "models")
|
||||
private val samplesPath = File(application.filesDir, "samples")
|
||||
private var recorder: Recorder = Recorder()
|
||||
private var whisperContext: WhisperContext? = null
|
||||
private var mediaPlayer: MediaPlayer? = null
|
||||
private var recordedFile: File? = null
|
||||
|
||||
init {
|
||||
viewModelScope.launch {
|
||||
loadData()
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun loadData() {
|
||||
printMessage("Loading data...\n")
|
||||
try {
|
||||
copyAssets()
|
||||
loadBaseModel()
|
||||
canTranscribe = true
|
||||
} catch (e: Exception) {
|
||||
Log.w(LOG_TAG, e)
|
||||
printMessage("${e.localizedMessage}\n")
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun printMessage(msg: String) = withContext(Dispatchers.Main) {
|
||||
dataLog += msg
|
||||
}
|
||||
|
||||
private suspend fun copyAssets() = withContext(Dispatchers.IO) {
|
||||
modelsPath.mkdirs()
|
||||
samplesPath.mkdirs()
|
||||
//application.copyData("models", modelsPath, ::printMessage)
|
||||
application.copyData("samples", samplesPath, ::printMessage)
|
||||
printMessage("All data copied to working directory.\n")
|
||||
}
|
||||
|
||||
private suspend fun loadBaseModel() = withContext(Dispatchers.IO) {
|
||||
printMessage("Loading model...\n")
|
||||
val models = application.assets.list("models/")
|
||||
if (models != null) {
|
||||
whisperContext = WhisperContext.createContextFromAsset(application.assets, "models/" + models[0])
|
||||
printMessage("Loaded model ${models[0]}.\n")
|
||||
}
|
||||
|
||||
//val firstModel = modelsPath.listFiles()!!.first()
|
||||
//whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
|
||||
}
|
||||
|
||||
fun transcribeSample() = viewModelScope.launch {
|
||||
transcribeAudio(getFirstSample())
|
||||
}
|
||||
|
||||
private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
|
||||
samplesPath.listFiles()!!.first()
|
||||
}
|
||||
|
||||
private suspend fun readAudioSamples(file: File): FloatArray = withContext(Dispatchers.IO) {
|
||||
stopPlayback()
|
||||
startPlayback(file)
|
||||
return@withContext decodeWaveFile(file)
|
||||
}
|
||||
|
||||
private suspend fun stopPlayback() = withContext(Dispatchers.Main) {
|
||||
mediaPlayer?.stop()
|
||||
mediaPlayer?.release()
|
||||
mediaPlayer = null
|
||||
}
|
||||
|
||||
private suspend fun startPlayback(file: File) = withContext(Dispatchers.Main) {
|
||||
mediaPlayer = MediaPlayer.create(application, file.absolutePath.toUri())
|
||||
mediaPlayer?.start()
|
||||
}
|
||||
|
||||
private suspend fun transcribeAudio(file: File) {
|
||||
if (!canTranscribe) {
|
||||
return
|
||||
}
|
||||
|
||||
canTranscribe = false
|
||||
|
||||
try {
|
||||
printMessage("Reading wave samples...\n")
|
||||
val data = readAudioSamples(file)
|
||||
printMessage("Transcribing data...\n")
|
||||
val text = whisperContext?.transcribeData(data)
|
||||
printMessage("Done: $text\n")
|
||||
} catch (e: Exception) {
|
||||
Log.w(LOG_TAG, e)
|
||||
printMessage("${e.localizedMessage}\n")
|
||||
}
|
||||
|
||||
canTranscribe = true
|
||||
}
|
||||
|
||||
fun toggleRecord() = viewModelScope.launch {
|
||||
try {
|
||||
if (isRecording) {
|
||||
recorder.stopRecording()
|
||||
isRecording = false
|
||||
recordedFile?.let { transcribeAudio(it) }
|
||||
} else {
|
||||
stopPlayback()
|
||||
val file = getTempFileForRecording()
|
||||
recorder.startRecording(file) { e ->
|
||||
viewModelScope.launch {
|
||||
withContext(Dispatchers.Main) {
|
||||
printMessage("${e.localizedMessage}\n")
|
||||
isRecording = false
|
||||
}
|
||||
}
|
||||
}
|
||||
isRecording = true
|
||||
recordedFile = file
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
Log.w(LOG_TAG, e)
|
||||
printMessage("${e.localizedMessage}\n")
|
||||
isRecording = false
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun getTempFileForRecording() = withContext(Dispatchers.IO) {
|
||||
File.createTempFile("recording", "wav")
|
||||
}
|
||||
|
||||
override fun onCleared() {
|
||||
runBlocking {
|
||||
whisperContext?.release()
|
||||
whisperContext = null
|
||||
stopPlayback()
|
||||
}
|
||||
}
|
||||
|
||||
companion object {
|
||||
fun factory() = viewModelFactory {
|
||||
initializer {
|
||||
val application =
|
||||
this[ViewModelProvider.AndroidViewModelFactory.APPLICATION_KEY] as Application
|
||||
MainScreenViewModel(application)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private suspend fun Context.copyData(
|
||||
assetDirName: String,
|
||||
destDir: File,
|
||||
printMessage: suspend (String) -> Unit
|
||||
) = withContext(Dispatchers.IO) {
|
||||
assets.list(assetDirName)?.forEach { name ->
|
||||
val assetPath = "$assetDirName/$name"
|
||||
Log.v(LOG_TAG, "Processing $assetPath...")
|
||||
val destination = File(destDir, name)
|
||||
Log.v(LOG_TAG, "Copying $assetPath to $destination...")
|
||||
printMessage("Copying $name...\n")
|
||||
assets.open(assetPath).use { input ->
|
||||
destination.outputStream().use { output ->
|
||||
input.copyTo(output)
|
||||
}
|
||||
}
|
||||
Log.v(LOG_TAG, "Copied $assetPath to $destination")
|
||||
}
|
||||
}
|
@@ -0,0 +1,11 @@
package com.whispercppdemo.ui.theme

import androidx.compose.ui.graphics.Color

val Purple80 = Color(0xFFD0BCFF)
val PurpleGrey80 = Color(0xFFCCC2DC)
val Pink80 = Color(0xFFEFB8C8)

val Purple40 = Color(0xFF6650a4)
val PurpleGrey40 = Color(0xFF625b71)
val Pink40 = Color(0xFF7D5260)
@ -0,0 +1,68 @@
|
||||
package com.whispercppdemo.ui.theme
|
||||
|
||||
import android.app.Activity
|
||||
import android.os.Build
|
||||
import androidx.compose.foundation.isSystemInDarkTheme
|
||||
import androidx.compose.material3.MaterialTheme
|
||||
import androidx.compose.material3.darkColorScheme
|
||||
import androidx.compose.material3.dynamicDarkColorScheme
|
||||
import androidx.compose.material3.dynamicLightColorScheme
|
||||
import androidx.compose.material3.lightColorScheme
|
||||
import androidx.compose.runtime.Composable
|
||||
import androidx.compose.runtime.SideEffect
|
||||
import androidx.compose.ui.graphics.toArgb
|
||||
import androidx.compose.ui.platform.LocalContext
|
||||
import androidx.compose.ui.platform.LocalView
|
||||
import androidx.core.view.ViewCompat
|
||||
|
||||
private val DarkColorScheme = darkColorScheme(
|
||||
primary = Purple80,
|
||||
secondary = PurpleGrey80,
|
||||
tertiary = Pink80
|
||||
)
|
||||
|
||||
private val LightColorScheme = lightColorScheme(
|
||||
primary = Purple40,
|
||||
secondary = PurpleGrey40,
|
||||
tertiary = Pink40
|
||||
|
||||
/* Other default colors to override
|
||||
background = Color(0xFFFFFBFE),
|
||||
surface = Color(0xFFFFFBFE),
|
||||
onPrimary = Color.White,
|
||||
onSecondary = Color.White,
|
||||
onTertiary = Color.White,
|
||||
onBackground = Color(0xFF1C1B1F),
|
||||
onSurface = Color(0xFF1C1B1F),
|
||||
*/
|
||||
)
|
||||
|
||||
@Composable
|
||||
fun WhisperCppDemoTheme(
|
||||
darkTheme: Boolean = isSystemInDarkTheme(),
|
||||
// Dynamic color is available on Android 12+
|
||||
dynamicColor: Boolean = true,
|
||||
content: @Composable () -> Unit
|
||||
) {
|
||||
val colorScheme = when {
|
||||
dynamicColor && Build.VERSION.SDK_INT >= Build.VERSION_CODES.S -> {
|
||||
val context = LocalContext.current
|
||||
if (darkTheme) dynamicDarkColorScheme(context) else dynamicLightColorScheme(context)
|
||||
}
|
||||
darkTheme -> DarkColorScheme
|
||||
else -> LightColorScheme
|
||||
}
|
||||
val view = LocalView.current
|
||||
if (!view.isInEditMode) {
|
||||
SideEffect {
|
||||
(view.context as Activity).window.statusBarColor = colorScheme.primary.toArgb()
|
||||
ViewCompat.getWindowInsetsController(view)?.isAppearanceLightStatusBars = darkTheme
|
||||
}
|
||||
}
|
||||
|
||||
MaterialTheme(
|
||||
colorScheme = colorScheme,
|
||||
typography = Typography,
|
||||
content = content
|
||||
)
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
package com.whispercppdemo.ui.theme
|
||||
|
||||
import androidx.compose.material3.Typography
|
||||
import androidx.compose.ui.text.TextStyle
|
||||
import androidx.compose.ui.text.font.FontFamily
|
||||
import androidx.compose.ui.text.font.FontWeight
|
||||
import androidx.compose.ui.unit.sp
|
||||
|
||||
// Set of Material typography styles to start with
|
||||
val Typography = Typography(
|
||||
bodyLarge = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Normal,
|
||||
fontSize = 16.sp,
|
||||
lineHeight = 24.sp,
|
||||
letterSpacing = 0.5.sp
|
||||
)
|
||||
/* Other default text styles to override
|
||||
titleLarge = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Normal,
|
||||
fontSize = 22.sp,
|
||||
lineHeight = 28.sp,
|
||||
letterSpacing = 0.sp
|
||||
),
|
||||
labelSmall = TextStyle(
|
||||
fontFamily = FontFamily.Default,
|
||||
fontWeight = FontWeight.Medium,
|
||||
fontSize = 11.sp,
|
||||
lineHeight = 16.sp,
|
||||
letterSpacing = 0.5.sp
|
||||
)
|
||||
*/
|
||||
)
|
@ -0,0 +1,122 @@
|
||||
package com.whispercppdemo.whisper
|
||||
|
||||
import android.content.res.AssetManager
|
||||
import android.os.Build
|
||||
import android.util.Log
|
||||
import kotlinx.coroutines.*
|
||||
import java.io.File
|
||||
import java.io.InputStream
|
||||
import java.util.concurrent.Executors
|
||||
|
||||
private const val LOG_TAG = "LibWhisper"
|
||||
|
||||
class WhisperContext private constructor(private var ptr: Long) {
|
||||
// Meet Whisper C++ constraint: Don't access from more than one thread at a time.
|
||||
private val scope: CoroutineScope = CoroutineScope(
|
||||
Executors.newSingleThreadExecutor().asCoroutineDispatcher()
|
||||
)
|
||||
|
||||
suspend fun transcribeData(data: FloatArray): String = withContext(scope.coroutineContext) {
|
||||
require(ptr != 0L)
|
||||
WhisperLib.fullTranscribe(ptr, data)
|
||||
val textCount = WhisperLib.getTextSegmentCount(ptr)
|
||||
return@withContext buildString {
|
||||
for (i in 0 until textCount) {
|
||||
append(WhisperLib.getTextSegment(ptr, i))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
suspend fun release() = withContext(scope.coroutineContext) {
|
||||
if (ptr != 0L) {
|
||||
WhisperLib.freeContext(ptr)
|
||||
ptr = 0
|
||||
}
|
||||
}
|
||||
|
||||
protected fun finalize() {
|
||||
runBlocking {
|
||||
release()
|
||||
}
|
||||
}
|
||||
|
||||
companion object {
|
||||
fun createContextFromFile(filePath: String): WhisperContext {
|
||||
val ptr = WhisperLib.initContext(filePath)
|
||||
if (ptr == 0L) {
|
||||
throw java.lang.RuntimeException("Couldn't create context with path $filePath")
|
||||
}
|
||||
return WhisperContext(ptr)
|
||||
}
|
||||
|
||||
fun createContextFromInputStream(stream: InputStream): WhisperContext {
|
||||
val ptr = WhisperLib.initContextFromInputStream(stream)
|
||||
|
||||
if (ptr == 0L) {
|
||||
throw java.lang.RuntimeException("Couldn't create context from input stream")
|
||||
}
|
||||
return WhisperContext(ptr)
|
||||
}
|
||||
|
||||
fun createContextFromAsset(assetManager: AssetManager, assetPath: String): WhisperContext {
|
||||
val ptr = WhisperLib.initContextFromAsset(assetManager, assetPath)
|
||||
|
||||
if (ptr == 0L) {
|
||||
throw java.lang.RuntimeException("Couldn't create context from asset $assetPath")
|
||||
}
|
||||
return WhisperContext(ptr)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private class WhisperLib {
|
||||
companion object {
|
||||
init {
|
||||
Log.d(LOG_TAG, "Primary ABI: ${Build.SUPPORTED_ABIS[0]}")
|
||||
var loadVfpv4 = false
|
||||
if (isArmEabiV7a()) {
|
||||
// armeabi-v7a needs runtime detection support
|
||||
val cpuInfo = cpuInfo()
|
||||
cpuInfo?.let {
|
||||
Log.d(LOG_TAG, "CPU info: $cpuInfo")
|
||||
if (cpuInfo.contains("vfpv4")) {
|
||||
Log.d(LOG_TAG, "CPU supports vfpv4")
|
||||
loadVfpv4 = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (loadVfpv4) {
|
||||
Log.d(LOG_TAG, "Loading libwhisper_vfpv4.so")
|
||||
System.loadLibrary("whisper_vfpv4")
|
||||
} else {
|
||||
Log.d(LOG_TAG, "Loading libwhisper.so")
|
||||
System.loadLibrary("whisper")
|
||||
}
|
||||
}
|
||||
|
||||
// JNI methods
|
||||
external fun initContextFromInputStream(inputStream: InputStream): Long
|
||||
external fun initContextFromAsset(assetManager: AssetManager, assetPath: String): Long
|
||||
external fun initContext(modelPath: String): Long
|
||||
external fun freeContext(contextPtr: Long)
|
||||
external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
|
||||
external fun getTextSegmentCount(contextPtr: Long): Int
|
||||
external fun getTextSegment(contextPtr: Long, index: Int): String
|
||||
}
|
||||
}
|
||||
|
||||
private fun isArmEabiV7a(): Boolean {
|
||||
return Build.SUPPORTED_ABIS[0].equals("armeabi-v7a")
|
||||
}
|
||||
|
||||
private fun cpuInfo(): String? {
|
||||
return try {
|
||||
File("/proc/cpuinfo").inputStream().bufferedReader().use {
|
||||
it.readText()
|
||||
}
|
||||
} catch (e: Exception) {
|
||||
Log.w(LOG_TAG, "Couldn't read /proc/cpuinfo", e)
|
||||
null
|
||||
}
|
||||
}
|
15
examples/whisper.android/app/src/main/jni/whisper/Android.mk
Normal file
@@ -0,0 +1,15 @@
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper
include $(LOCAL_PATH)/Whisper.mk
include $(BUILD_SHARED_LIBRARY)

ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper_vfpv4
include $(LOCAL_PATH)/Whisper.mk
# Allow building NEON FMA code.
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
LOCAL_CFLAGS += -mfpu=neon-vfpv4
include $(BUILD_SHARED_LIBRARY)
endif
@ -0,0 +1 @@
|
||||
APP_STL := c++_static
|
18 examples/whisper.android/app/src/main/jni/whisper/Whisper.mk Normal file
@@ -0,0 +1,18 @@
WHISPER_LIB_DIR := $(LOCAL_PATH)/../../../../../../../
LOCAL_LDLIBS := -landroid -llog

# Make the final output library smaller by only keeping the symbols referenced from the app.
ifneq ($(APP_OPTIM),debug)
LOCAL_CFLAGS += -O3
LOCAL_CFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
LOCAL_CFLAGS += -ffunction-sections -fdata-sections
LOCAL_LDFLAGS += -Wl,--gc-sections
LOCAL_LDFLAGS += -Wl,--exclude-libs,ALL
LOCAL_LDFLAGS += -flto
endif

LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR)
LOCAL_CPPFLAGS += -std=c++11
LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \
                   $(WHISPER_LIB_DIR)/whisper.cpp \
                   $(LOCAL_PATH)/jni.c
216 examples/whisper.android/app/src/main/jni/whisper/jni.c Normal file
@@ -0,0 +1,216 @@
#include <jni.h>
#include <android/asset_manager.h>
#include <android/asset_manager_jni.h>
#include <android/log.h>
#include <stdlib.h>
#include <sys/sysinfo.h>
#include <string.h>
#include "whisper.h"

#define UNUSED(x) (void)(x)
#define TAG "JNI"

#define LOGI(...) __android_log_print(ANDROID_LOG_INFO, TAG, __VA_ARGS__)
#define LOGW(...) __android_log_print(ANDROID_LOG_WARN, TAG, __VA_ARGS__)

static inline int min(int a, int b) {
    return (a < b) ? a : b;
}

static inline int max(int a, int b) {
    return (a > b) ? a : b;
}

struct input_stream_context {
    size_t offset;
    JNIEnv * env;
    jobject thiz;
    jobject input_stream;

    jmethodID mid_available;
    jmethodID mid_read;
};

size_t inputStreamRead(void * ctx, void * output, size_t read_size) {
    struct input_stream_context* is = (struct input_stream_context*)ctx;

    jint avail_size = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_available);
    jint size_to_copy = read_size < avail_size ? (jint)read_size : avail_size;

    jbyteArray byte_array = (*is->env)->NewByteArray(is->env, size_to_copy);

    jint n_read = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_read, byte_array, 0, size_to_copy);

    if (size_to_copy != read_size || size_to_copy != n_read) {
        LOGI("Insufficient Read: Req=%zu, ToCopy=%d, Available=%d", read_size, size_to_copy, n_read);
    }

    jbyte* byte_array_elements = (*is->env)->GetByteArrayElements(is->env, byte_array, NULL);
    memcpy(output, byte_array_elements, size_to_copy);
    (*is->env)->ReleaseByteArrayElements(is->env, byte_array, byte_array_elements, JNI_ABORT);

    (*is->env)->DeleteLocalRef(is->env, byte_array);

    is->offset += size_to_copy;

    return size_to_copy;
}
bool inputStreamEof(void * ctx) {
    struct input_stream_context* is = (struct input_stream_context*)ctx;

    jint result = (*is->env)->CallIntMethod(is->env, is->input_stream, is->mid_available);
    return result <= 0;
}
void inputStreamClose(void * ctx) {

}

JNIEXPORT jlong JNICALL
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContextFromInputStream(
        JNIEnv *env, jobject thiz, jobject input_stream) {
    UNUSED(thiz);

    struct whisper_context *context = NULL;
    struct whisper_model_loader loader = {};
    struct input_stream_context inp_ctx = {};

    inp_ctx.offset = 0;
    inp_ctx.env = env;
    inp_ctx.thiz = thiz;
    inp_ctx.input_stream = input_stream;

    jclass cls = (*env)->GetObjectClass(env, input_stream);
    inp_ctx.mid_available = (*env)->GetMethodID(env, cls, "available", "()I");
    inp_ctx.mid_read = (*env)->GetMethodID(env, cls, "read", "([BII)I");

    loader.context = &inp_ctx;
    loader.read = inputStreamRead;
    loader.eof = inputStreamEof;
    loader.close = inputStreamClose;

    loader.eof(loader.context);

    context = whisper_init(&loader);
    return (jlong) context;
}

static size_t asset_read(void *ctx, void *output, size_t read_size) {
    return AAsset_read((AAsset *) ctx, output, read_size);
}

static bool asset_is_eof(void *ctx) {
    return AAsset_getRemainingLength64((AAsset *) ctx) <= 0;
}

static void asset_close(void *ctx) {
    AAsset_close((AAsset *) ctx);
}

static struct whisper_context *whisper_init_from_asset(
        JNIEnv *env,
        jobject assetManager,
        const char *asset_path
) {
    LOGI("Loading model from asset '%s'\n", asset_path);
    AAssetManager *asset_manager = AAssetManager_fromJava(env, assetManager);
    AAsset *asset = AAssetManager_open(asset_manager, asset_path, AASSET_MODE_STREAMING);
    if (!asset) {
        LOGW("Failed to open '%s'\n", asset_path);
        return NULL;
    }

    whisper_model_loader loader = {
            .context = asset,
            .read = &asset_read,
            .eof = &asset_is_eof,
            .close = &asset_close
    };

    return whisper_init(&loader);
}

JNIEXPORT jlong JNICALL
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContextFromAsset(
        JNIEnv *env, jobject thiz, jobject assetManager, jstring asset_path_str) {
    UNUSED(thiz);
    struct whisper_context *context = NULL;
    const char *asset_path_chars = (*env)->GetStringUTFChars(env, asset_path_str, NULL);
    context = whisper_init_from_asset(env, assetManager, asset_path_chars);
    (*env)->ReleaseStringUTFChars(env, asset_path_str, asset_path_chars);
    return (jlong) context;
}

JNIEXPORT jlong JNICALL
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_initContext(
        JNIEnv *env, jobject thiz, jstring model_path_str) {
    UNUSED(thiz);
    struct whisper_context *context = NULL;
    const char *model_path_chars = (*env)->GetStringUTFChars(env, model_path_str, NULL);
    context = whisper_init_from_file(model_path_chars);
    (*env)->ReleaseStringUTFChars(env, model_path_str, model_path_chars);
    return (jlong) context;
}

JNIEXPORT void JNICALL
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_freeContext(
        JNIEnv *env, jobject thiz, jlong context_ptr) {
    UNUSED(env);
    UNUSED(thiz);
    struct whisper_context *context = (struct whisper_context *) context_ptr;
    whisper_free(context);
}

JNIEXPORT void JNICALL
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_fullTranscribe(
        JNIEnv *env, jobject thiz, jlong context_ptr, jfloatArray audio_data) {
    UNUSED(thiz);
    struct whisper_context *context = (struct whisper_context *) context_ptr;
    jfloat *audio_data_arr = (*env)->GetFloatArrayElements(env, audio_data, NULL);
    const jsize audio_data_length = (*env)->GetArrayLength(env, audio_data);

    // Leave 2 processors free (i.e. the high-efficiency cores).
    int max_threads = max(1, min(8, get_nprocs() - 2));
    LOGI("Selecting %d threads", max_threads);

    // The below adapted from the Objective-C iOS sample
    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    params.print_realtime = true;
    params.print_progress = false;
    params.print_timestamps = true;
    params.print_special = false;
    params.translate = false;
    params.language = "en";
    params.n_threads = max_threads;
    params.offset_ms = 0;
    params.no_context = true;
    params.single_segment = false;

    whisper_reset_timings(context);

    LOGI("About to run whisper_full");
    if (whisper_full(context, params, audio_data_arr, audio_data_length) != 0) {
        LOGI("Failed to run the model");
    } else {
        whisper_print_timings(context);
    }
    (*env)->ReleaseFloatArrayElements(env, audio_data, audio_data_arr, JNI_ABORT);
}

JNIEXPORT jint JNICALL
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegmentCount(
        JNIEnv *env, jobject thiz, jlong context_ptr) {
    UNUSED(env);
    UNUSED(thiz);
    struct whisper_context *context = (struct whisper_context *) context_ptr;
    return whisper_full_n_segments(context);
}

JNIEXPORT jstring JNICALL
Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
        JNIEnv *env, jobject thiz, jlong context_ptr, jint index) {
    UNUSED(thiz);
    struct whisper_context *context = (struct whisper_context *) context_ptr;
    const char *text = whisper_full_get_segment_text(context, index);
    jstring string = (*env)->NewStringUTF(env, text);
    return string;
}
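
The FloatArray that fullTranscribe() receives is handed directly to whisper_full(), which expects 16 kHz mono samples normalized to [-1, 1]. A minimal Kotlin sketch of producing such an array from 16-bit PCM; this helper is illustrative only and is not part of this diff:

// Illustrative only: convert 16-bit PCM (already 16 kHz mono) to the normalized
// FloatArray that WhisperLib.fullTranscribe() forwards to whisper_full().
fun pcm16ToFloat(pcm: ShortArray): FloatArray {
    return FloatArray(pcm.size) { i ->
        (pcm[i] / 32767.0f).coerceIn(-1.0f, 1.0f)
    }
}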
@@ -0,0 +1,170 @@
<?xml version="1.0" encoding="utf-8"?>
<vector xmlns:android="http://schemas.android.com/apk/res/android"
    android:width="108dp"
    android:height="108dp"
    android:viewportWidth="108"
    android:viewportHeight="108">
    <path
        android:fillColor="#3DDC84"
        android:pathData="M0,0h108v108h-108z" />
    <path
        android:fillColor="#00000000"
        android:pathData="M9,0L9,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,0L19,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M29,0L29,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M39,0L39,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M49,0L49,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M59,0L59,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M69,0L69,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M79,0L79,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M89,0L89,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M99,0L99,108"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,9L108,9"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,19L108,19"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,29L108,29"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,39L108,39"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,49L108,49"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,59L108,59"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,69L108,69"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,79L108,79"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,89L108,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M0,99L108,99"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,29L89,29"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,39L89,39"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,49L89,49"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,59L89,59"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,69L89,69"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M19,79L89,79"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M29,19L29,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M39,19L39,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M49,19L49,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M59,19L59,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M69,19L69,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
    <path
        android:fillColor="#00000000"
        android:pathData="M79,19L79,89"
        android:strokeWidth="0.8"
        android:strokeColor="#33FFFFFF" />
</vector>
@@ -0,0 +1,30 @@
<vector xmlns:android="http://schemas.android.com/apk/res/android"
    xmlns:aapt="http://schemas.android.com/aapt"
    android:width="108dp"
    android:height="108dp"
    android:viewportWidth="108"
    android:viewportHeight="108">
    <path android:pathData="M31,63.928c0,0 6.4,-11 12.1,-13.1c7.2,-2.6 26,-1.4 26,-1.4l38.1,38.1L107,108.928l-32,-1L31,63.928z">
        <aapt:attr name="android:fillColor">
            <gradient
                android:endX="85.84757"
                android:endY="92.4963"
                android:startX="42.9492"
                android:startY="49.59793"
                android:type="linear">
                <item
                    android:color="#44000000"
                    android:offset="0.0" />
                <item
                    android:color="#00000000"
                    android:offset="1.0" />
            </gradient>
        </aapt:attr>
    </path>
    <path
        android:fillColor="#FFFFFF"
        android:fillType="nonZero"
        android:pathData="M65.3,45.828l3.8,-6.6c0.2,-0.4 0.1,-0.9 -0.3,-1.1c-0.4,-0.2 -0.9,-0.1 -1.1,0.3l-3.9,6.7c-6.3,-2.8 -13.4,-2.8 -19.7,0l-3.9,-6.7c-0.2,-0.4 -0.7,-0.5 -1.1,-0.3C38.8,38.328 38.7,38.828 38.9,39.228l3.8,6.6C36.2,49.428 31.7,56.028 31,63.928h46C76.3,56.028 71.8,49.428 65.3,45.828zM43.4,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2c-0.3,-0.7 -0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C45.3,56.528 44.5,57.328 43.4,57.328L43.4,57.328zM64.6,57.328c-0.8,0 -1.5,-0.5 -1.8,-1.2s-0.1,-1.5 0.4,-2.1c0.5,-0.5 1.4,-0.7 2.1,-0.4c0.7,0.3 1.2,1 1.2,1.8C66.5,56.528 65.6,57.328 64.6,57.328L64.6,57.328z"
        android:strokeWidth="1"
        android:strokeColor="#00000000" />
</vector>
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<adaptive-icon xmlns:android="http://schemas.android.com/apk/res/android">
    <background android:drawable="@drawable/ic_launcher_background" />
    <foreground android:drawable="@drawable/ic_launcher_foreground" />
</adaptive-icon>
10 examples/whisper.android/app/src/main/res/values/colors.xml Normal file
@@ -0,0 +1,10 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>
    <color name="purple_200">#FFBB86FC</color>
    <color name="purple_500">#FF6200EE</color>
    <color name="purple_700">#FF3700B3</color>
    <color name="teal_200">#FF03DAC5</color>
    <color name="teal_700">#FF018786</color>
    <color name="black">#FF000000</color>
    <color name="white">#FFFFFFFF</color>
</resources>
@@ -0,0 +1,3 @@
<resources>
    <string name="app_name">WhisperCppDemo</string>
</resources>
@@ -0,0 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<resources>

    <style name="Theme.WhisperCppDemo" parent="android:Theme.Material.Light.NoActionBar" />
</resources>
@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="utf-8"?><!--
   Sample backup rules file; uncomment and customize as necessary.
   See https://developer.android.com/guide/topics/data/autobackup
   for details.
   Note: This file is ignored for devices older that API 31
   See https://developer.android.com/about/versions/12/backup-restore
-->
<full-backup-content>
    <!--
   <include domain="sharedpref" path="."/>
   <exclude domain="sharedpref" path="device.xml"/>
-->
</full-backup-content>
@@ -0,0 +1,19 @@
<?xml version="1.0" encoding="utf-8"?><!--
   Sample data extraction rules file; uncomment and customize as necessary.
   See https://developer.android.com/about/versions/12/backup-restore#xml-changes
   for details.
-->
<data-extraction-rules>
    <cloud-backup>
        <!-- TODO: Use <include> and <exclude> to control what is backed up.
        <include .../>
        <exclude .../>
        -->
    </cloud-backup>
    <!--
    <device-transfer>
        <include .../>
        <exclude .../>
    </device-transfer>
    -->
</data-extraction-rules>
@@ -0,0 +1,17 @@
package com.whispercppdemo

import org.junit.Test

import org.junit.Assert.*

/**
 * Example local unit test, which will execute on the development machine (host).
 *
 * See [testing documentation](http://d.android.com/tools/testing).
 */
class ExampleUnitTest {
    @Test
    fun addition_isCorrect() {
        assertEquals(4, 2 + 2)
    }
}
6 examples/whisper.android/build.gradle Normal file
@@ -0,0 +1,6 @@
// Top-level build file where you can add configuration options common to all sub-projects/modules.
plugins {
    id 'com.android.application' version '7.3.1' apply false
    id 'com.android.library' version '7.3.1' apply false
    id 'org.jetbrains.kotlin.android' version '1.7.10' apply false
}
23 examples/whisper.android/gradle.properties Normal file
@@ -0,0 +1,23 @@
# Project-wide Gradle settings.
# IDE (e.g. Android Studio) users:
# Gradle settings configured through the IDE *will override*
# any settings specified in this file.
# For more details on how to configure your build environment visit
# http://www.gradle.org/docs/current/userguide/build_environment.html
# Specifies the JVM arguments used for the daemon process.
# The setting is particularly useful for tweaking memory settings.
org.gradle.jvmargs=-Xmx2048m -Dfile.encoding=UTF-8
# When configured, Gradle will run in incubating parallel mode.
# This option should only be used with decoupled projects. More details, visit
# http://www.gradle.org/docs/current/userguide/multi_project_builds.html#sec:decoupled_projects
# org.gradle.parallel=true
# AndroidX package structure to make it clearer which packages are bundled with the
# Android operating system, and which are packaged with your app's APK
# https://developer.android.com/topic/libraries/support-library/androidx-rn
android.useAndroidX=true
# Kotlin code style for this project: "official" or "obsolete":
kotlin.code.style=official
# Enables namespacing of each library's R class so that its R class includes only the
# resources declared in the library itself and none from the library's dependencies,
# thereby reducing the size of the R class for that library
android.nonTransitiveRClass=true
BIN examples/whisper.android/gradle/wrapper/gradle-wrapper.jar vendored Normal file
Binary file not shown.
6 examples/whisper.android/gradle/wrapper/gradle-wrapper.properties vendored Normal file
@@ -0,0 +1,6 @@
#Wed Dec 14 10:37:24 EST 2022
distributionBase=GRADLE_USER_HOME
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
distributionPath=wrapper/dists
zipStorePath=wrapper/dists
zipStoreBase=GRADLE_USER_HOME
185 examples/whisper.android/gradlew vendored Executable file
@@ -0,0 +1,185 @@
#!/usr/bin/env sh

#
# Copyright 2015 the original author or authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

##############################################################################
##
##  Gradle start up script for UN*X
##
##############################################################################

# Attempt to set APP_HOME
# Resolve links: $0 may be a link
PRG="$0"
# Need this for relative symlinks.
while [ -h "$PRG" ] ; do
    ls=`ls -ld "$PRG"`
    link=`expr "$ls" : '.*-> \(.*\)$'`
    if expr "$link" : '/.*' > /dev/null; then
        PRG="$link"
    else
        PRG=`dirname "$PRG"`"/$link"
    fi
done
SAVED="`pwd`"
cd "`dirname \"$PRG\"`/" >/dev/null
APP_HOME="`pwd -P`"
cd "$SAVED" >/dev/null

APP_NAME="Gradle"
APP_BASE_NAME=`basename "$0"`

# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"'

# Use the maximum available, or set MAX_FD != -1 to use that value.
MAX_FD="maximum"

warn () {
    echo "$*"
}

die () {
    echo
    echo "$*"
    echo
    exit 1
}

# OS specific support (must be 'true' or 'false').
cygwin=false
msys=false
darwin=false
nonstop=false
case "`uname`" in
  CYGWIN* )
    cygwin=true
    ;;
  Darwin* )
    darwin=true
    ;;
  MINGW* )
    msys=true
    ;;
  NONSTOP* )
    nonstop=true
    ;;
esac

CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar


# Determine the Java command to use to start the JVM.
if [ -n "$JAVA_HOME" ] ; then
    if [ -x "$JAVA_HOME/jre/sh/java" ] ; then
        # IBM's JDK on AIX uses strange locations for the executables
        JAVACMD="$JAVA_HOME/jre/sh/java"
    else
        JAVACMD="$JAVA_HOME/bin/java"
    fi
    if [ ! -x "$JAVACMD" ] ; then
        die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
    fi
else
    JAVACMD="java"
    which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.

Please set the JAVA_HOME variable in your environment to match the
location of your Java installation."
fi

# Increase the maximum file descriptors if we can.
if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then
    MAX_FD_LIMIT=`ulimit -H -n`
    if [ $? -eq 0 ] ; then
        if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then
            MAX_FD="$MAX_FD_LIMIT"
        fi
        ulimit -n $MAX_FD
        if [ $? -ne 0 ] ; then
            warn "Could not set maximum file descriptor limit: $MAX_FD"
        fi
    else
        warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT"
    fi
fi

# For Darwin, add options to specify how the application appears in the dock
if $darwin; then
    GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\""
fi

# For Cygwin or MSYS, switch paths to Windows format before running java
if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then
    APP_HOME=`cygpath --path --mixed "$APP_HOME"`
    CLASSPATH=`cygpath --path --mixed "$CLASSPATH"`

    JAVACMD=`cygpath --unix "$JAVACMD"`

    # We build the pattern for arguments to be converted via cygpath
    ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null`
    SEP=""
    for dir in $ROOTDIRSRAW ; do
        ROOTDIRS="$ROOTDIRS$SEP$dir"
        SEP="|"
    done
    OURCYGPATTERN="(^($ROOTDIRS))"
    # Add a user-defined pattern to the cygpath arguments
    if [ "$GRADLE_CYGPATTERN" != "" ] ; then
        OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)"
    fi
    # Now convert the arguments - kludge to limit ourselves to /bin/sh
    i=0
    for arg in "$@" ; do
        CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -`
        CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option

        if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition
            eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"`
        else
            eval `echo args$i`="\"$arg\""
        fi
        i=`expr $i + 1`
    done
    case $i in
        0) set -- ;;
        1) set -- "$args0" ;;
        2) set -- "$args0" "$args1" ;;
        3) set -- "$args0" "$args1" "$args2" ;;
        4) set -- "$args0" "$args1" "$args2" "$args3" ;;
        5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;;
        6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;;
        7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;;
        8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;;
        9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;;
    esac
fi

# Escape application args
save () {
    for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done
    echo " "
}
APP_ARGS=`save "$@"`

# Collect all arguments for the java command, following the shell quoting and substitution rules
eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS"

exec "$JAVACMD" "$@"
89 examples/whisper.android/gradlew.bat vendored Normal file
@@ -0,0 +1,89 @@
@rem
@rem Copyright 2015 the original author or authors.
@rem
@rem Licensed under the Apache License, Version 2.0 (the "License");
@rem you may not use this file except in compliance with the License.
@rem You may obtain a copy of the License at
@rem
@rem      https://www.apache.org/licenses/LICENSE-2.0
@rem
@rem Unless required by applicable law or agreed to in writing, software
@rem distributed under the License is distributed on an "AS IS" BASIS,
@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@rem See the License for the specific language governing permissions and
@rem limitations under the License.
@rem

@if "%DEBUG%" == "" @echo off
@rem ##########################################################################
@rem
@rem  Gradle startup script for Windows
@rem
@rem ##########################################################################

@rem Set local scope for the variables with windows NT shell
if "%OS%"=="Windows_NT" setlocal

set DIRNAME=%~dp0
if "%DIRNAME%" == "" set DIRNAME=.
set APP_BASE_NAME=%~n0
set APP_HOME=%DIRNAME%

@rem Resolve any "." and ".." in APP_HOME to make it shorter.
for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi

@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script.
set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m"

@rem Find java.exe
if defined JAVA_HOME goto findJavaFromJavaHome

set JAVA_EXE=java.exe
%JAVA_EXE% -version >NUL 2>&1
if "%ERRORLEVEL%" == "0" goto execute

echo.
echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH.
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:findJavaFromJavaHome
set JAVA_HOME=%JAVA_HOME:"=%
set JAVA_EXE=%JAVA_HOME%/bin/java.exe

if exist "%JAVA_EXE%" goto execute

echo.
echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME%
echo.
echo Please set the JAVA_HOME variable in your environment to match the
echo location of your Java installation.

goto fail

:execute
@rem Setup the command line

set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar


@rem Execute Gradle
"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %*

:end
@rem End local scope for the variables with windows NT shell
if "%ERRORLEVEL%"=="0" goto mainEnd

:fail
rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of
rem the _cmd.exe /c_ return code!
if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1
exit /b 1

:mainEnd
if "%OS%"=="Windows_NT" endlocal

:omega
16 examples/whisper.android/settings.gradle Normal file
@@ -0,0 +1,16 @@
pluginManagement {
    repositories {
        gradlePluginPortal()
        google()
        mavenCentral()
    }
}
dependencyResolutionManagement {
    repositoriesMode.set(RepositoriesMode.FAIL_ON_PROJECT_REPOS)
    repositories {
        google()
        mavenCentral()
    }
}
rootProject.name = "WhisperCppDemo"
include ':app'
Some files were not shown because too many files have changed in this diff.