ggml : initial tests with libnvblas

2025-07-01 23:10:47 +02:00 · 2022-12-08 22:01:52 +02:00
43 changed files with 181 additions and 2882 deletions
--- a/.gitignore
+++ b/.gitignore
@ -11,19 +11,14 @@ build-release/
 build-sanitize-addr/
 build-sanitize-thread/

-/main
-/stream
-/command
-/talk
-/bench
-
+main
+stream
+command
+bench
 sync.sh
-libwhisper.so
 compile_commands.json

 examples/arm_neon.h
 examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
 examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
-
-extra/bench-gg.txt
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,5 +1,5 @@
 cmake_minimum_required (VERSION 3.0)
-project(whisper.cpp VERSION 1.0.3)
+project(whisper.cpp VERSION 1.0.0)

 set(CMAKE_EXPORT_COMPILE_COMMANDS "on")
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
@ -14,7 +14,6 @@ if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
    if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
        configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
    endif()
-    configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY)
 else()
    set(WHISPER_STANDALONE OFF)
 endif()
@ -82,7 +81,7 @@ endif()
 # dependencies

 set(CMAKE_C_STANDARD   11)
-set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD 20)

 find_package(Threads REQUIRED)

@ -152,7 +151,8 @@ else()
        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
    else()
        if (EMSCRIPTEN)
-            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread")
+            # we require support for WASM SIMD 128-bit
+            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -msimd128")
            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
        else()
            if(NOT WHISPER_NO_AVX)
@ -203,10 +203,6 @@ if (BUILD_SHARED_LIBS)
        )
 endif()

-if (EMSCRIPTEN)
-    set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
-endif()
-
 target_compile_definitions(${TARGET} PUBLIC
    ${WHISPER_EXTRA_FLAGS}
    )
@ -226,11 +222,13 @@ add_subdirectory(bindings)
 # programs, examples and tests
 #

-if (WHISPER_BUILD_TESTS)
-    enable_testing()
-    add_subdirectory(tests)
-endif ()
+if (WHISPER_STANDALONE)
+    if (WHISPER_BUILD_TESTS)
+        enable_testing()
+        add_subdirectory(tests)
+    endif ()

-if (WHISPER_BUILD_EXAMPLES)
-    add_subdirectory(examples)
-endif()
+    if (WHISPER_BUILD_EXAMPLES)
+        add_subdirectory(examples)
+    endif()
+endif ()
--- a/30
+++ b/30
@ -27,8 +27,8 @@ endif
 # Compile flags
 #

-CFLAGS   = -I.              -O3 -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11  
+CXXFLAGS = -I. -I./examples -O3 -std=c++11
 LDFLAGS  =

 # OS specific
@ -45,10 +45,6 @@ ifeq ($(UNAME_S),FreeBSD)
 	CFLAGS   += -pthread
 	CXXFLAGS += -pthread
 endif
-ifeq ($(UNAME_S),Haiku)
-	CFLAGS   += -pthread
-	CXXFLAGS += -pthread
-endif

 # Architecture specific
 # TODO: probably these flags need to be tweaked on some architectures
@ -81,23 +77,6 @@ ifeq ($(UNAME_M),x86_64)
 		ifneq (,$(findstring f16c,$(F16C_M)))
 			CFLAGS += -mf16c
 		endif
-	else ifeq ($(UNAME_S),Haiku)
-		AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
-		ifneq (,$(findstring avx,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
-		ifneq (,$(findstring avx2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell sysinfo -cpu | grep "FMA ")
-		ifneq (,$(findstring fma,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell sysinfo -cpu | grep "F16C ")
-		ifneq (,$(findstring f16c,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
 	else
 		CFLAGS += -mfma -mf16c -mavx -mavx2
 	endif
@ -154,7 +133,7 @@ libwhisper.so: ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command bench libwhisper.a libwhisper.so

 #
 # Examples
@ -172,9 +151,6 @@ stream: examples/stream/stream.cpp ggml.o whisper.o
 command: examples/command/command.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/command/command.cpp ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)

-talk: examples/talk/talk.cpp  examples/talk/gpt-2.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
-
 bench: examples/bench/bench.cpp ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

--- a/README.md
+++ b/README.md
@ -2,7 +2,6 @@

 [![Actions Status](https://github.com/ggerganov/whisper.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/whisper.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
-[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -449,8 +448,8 @@ in [models](models).

 - [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm)
- [X] Javascript: [bindings/javascript](bindings/javascript)
- [ ] Python: soon
+- [ ] Python:
+- [ ] Java:

 ## Examples

@ -460,10 +459,10 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
 | Example | Web | Description |
 | ---     | --- | ---         |
 | [main](examples/main) | [whisper.wasm](examples/whisper.wasm) | Tool for translating and transcribing audio using Whisper |
-| [bench](examples/bench) | [bench.wasm](examples/bench.wasm) | Benchmark the performance of Whisper on your machine |
+| [bench](examples/bench) | | Benchmark the performance of Whisper on your machine |
 | [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
 | [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
-| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
+| | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot in your browser |
 | [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
 | [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
 | [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
--- a/bindings/CMakeLists.txt
+++ b/bindings/CMakeLists.txt
@ -1,19 +1,3 @@
 if (EMSCRIPTEN)
    add_subdirectory(javascript)
-
-    add_custom_command(
-        OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/javascript/publish.log
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/whisper.js
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/libwhisper.worker.js
-        DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/javascript/package.json
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/javascript
-        COMMAND npm publish
-        COMMAND touch publish.log
-        COMMENT "Publishing npm module v${PROJECT_VERSION}"
-        VERBATIM
-        )
-
-    add_custom_target(publish-npm
-        DEPENDS javascript/publish.log
-        )
 endif()
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/CMakeLists.txt
+++ b/bindings/javascript/CMakeLists.txt
@ -20,22 +20,15 @@ if (WHISPER_WASM_SINGLE_FILE)
        ${CMAKE_BINARY_DIR}/bin/libwhisper.js
        ${CMAKE_CURRENT_SOURCE_DIR}/whisper.js
        )
-
-    add_custom_command(
-        TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy
-        ${CMAKE_BINARY_DIR}/bin/libwhisper.worker.js
-        ${CMAKE_CURRENT_SOURCE_DIR}/libwhisper.worker.js
-        )
 endif()

 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
-    -s MODULARIZE=1 \
-    -s EXPORT_NAME=\"'whisper_factory'\" \
-    -s FORCE_FILESYSTEM=1 \
    -s USE_PTHREADS=1 \
    -s PTHREAD_POOL_SIZE=8 \
-    -s ALLOW_MEMORY_GROWTH=1 \
+    -s INITIAL_MEMORY=1610612736 \
+    -s TOTAL_MEMORY=1610612736 \
+    -s FORCE_FILESYSTEM=1 \
+    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
    ")
--- a/bindings/javascript/README.md
+++ b/bindings/javascript/README.md
@ -1,78 +0,0 @@
-# whisper.cpp
-
-Node.js package for Whisper speech recognition
-
-Package: https://www.npmjs.com/package/whisper.cpp
-
-## Details
-
-The performance is comparable to when running `whisper.cpp` in the browser via WASM.
-
-The API is currently very rudimentary: [bindings/javascript/emscripten.cpp](/bindings/javascript/emscripten.cpp)
-
-For sample usage check [tests/test-whisper.js](/tests/test-whisper.js)
-
-## Package building + test
-
-```bash
-# load emscripten
-source /path/to/emsdk/emsdk_env.sh
-
-# clone repo
-git clone https://github.com/ggerganov/whisper.cpp
-cd whisper.cpp
-
-# grab base.en model
-./models/download-ggml-model.sh base.en
-
-# prepare PCM sample for testing
-ffmpeg -i samples/jfk.wav -f f32le -acodec pcm_f32le samples/jfk.pcmf32
-
-# build
-mkdir build-em && cd build-em
-emcmake cmake .. && make -j
-
-# run test
-node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
-
-# publish npm package
-make publish-npm
-```
-
-## Sample run
-
-```java
-$ node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
-
-whisper_model_load: loading model from 'whisper.bin'
-whisper_model_load: n_vocab       = 51864
-whisper_model_load: n_audio_ctx   = 1500
-whisper_model_load: n_audio_state = 512
-whisper_model_load: n_audio_head  = 8
-whisper_model_load: n_audio_layer = 6
-whisper_model_load: n_text_ctx    = 448
-whisper_model_load: n_text_state  = 512
-whisper_model_load: n_text_head   = 8
-whisper_model_load: n_text_layer  = 6
-whisper_model_load: n_mels        = 80
-whisper_model_load: f16           = 1
-whisper_model_load: type          = 2
-whisper_model_load: adding 1607 extra tokens
-whisper_model_load: mem_required  =  506.00 MB
-whisper_model_load: ggml ctx size =  140.60 MB
-whisper_model_load: memory size   =   22.83 MB
-whisper_model_load: model size    =  140.54 MB
-
-system_info: n_threads = 8 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | NEON = 0 | F16C = 0 | FP16_VA = 0 | WASM_SIMD = 1 | BLAS = 0 | 
-
-operator(): processing 176000 samples, 11.0 sec, 8 threads, 1 processors, lang = en, task = transcribe ...
-
-[00:00:00.000 --> 00:00:11.000]   And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
-
-whisper_print_timings:     load time =   162.37 ms
-whisper_print_timings:      mel time =   183.70 ms
-whisper_print_timings:   sample time =     4.27 ms
-whisper_print_timings:   encode time =  8582.63 ms / 1430.44 ms per layer
-whisper_print_timings:   decode time =   436.16 ms / 72.69 ms per layer
-whisper_print_timings:    total time =  9370.90 ms
-```
--- a/bindings/javascript/emscripten.cpp
+++ b/bindings/javascript/emscripten.cpp
@ -1,48 +1,63 @@
-//
-// This is the Javascript API of whisper.cpp
-//
-// Very crude at the moment.
-// Feel free to contribute and make this better!
-//
-// See the tests/test-whisper.js for sample usage
-//
-
 #include "whisper.h"

 #include <emscripten.h>
 #include <emscripten/bind.h>

-#include <thread>
 #include <vector>
+#include <thread>

-struct whisper_context * g_context;
+std::thread g_worker;
+
+std::vector<struct whisper_context *> g_contexts(4, nullptr);

 EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
-        if (g_context == nullptr) {
-            g_context = whisper_init(path_model.c_str());
-            if (g_context != nullptr) {
-                return true;
-            } else {
-                return false;
+        if (g_worker.joinable()) {
+            g_worker.join();
+        }
+
+        for (size_t i = 0; i < g_contexts.size(); ++i) {
+            if (g_contexts[i] == nullptr) {
+                g_contexts[i] = whisper_init(path_model.c_str());
+                if (g_contexts[i] != nullptr) {
+                    return i + 1;
+                } else {
+                    return (size_t) 0;
+                }
            }
        }

-        return false;
+        return (size_t) 0;
    }));

-    emscripten::function("free", emscripten::optional_override([]() {
-        if (g_context) {
-            whisper_free(g_context);
-            g_context = nullptr;
+    emscripten::function("free", emscripten::optional_override([](size_t index) {
+        if (g_worker.joinable()) {
+            g_worker.join();
+        }
+
+        --index;
+
+        if (index < g_contexts.size()) {
+            whisper_free(g_contexts[index]);
+            g_contexts[index] = nullptr;
        }
    }));

-    emscripten::function("full_default", emscripten::optional_override([](const emscripten::val & audio, const std::string & lang, bool translate) {
-        if (g_context == nullptr) {
+    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
+        if (g_worker.joinable()) {
+            g_worker.join();
+        }
+
+        --index;
+
+        if (index >= g_contexts.size()) {
            return -1;
        }

+        if (g_contexts[index] == nullptr) {
+            return -2;
+        }
+
        struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);

        params.print_realtime   = true;
@ -50,7 +65,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        params.print_timestamps = true;
        params.print_special    = false;
        params.translate        = translate;
-        params.language         = whisper_is_multilingual(g_context) ? lang.c_str() : "en";
+        params.language         = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
        params.n_threads        = std::min(8, (int) std::thread::hardware_concurrency());
        params.offset_ms        = 0;

@ -67,11 +82,9 @@ EMSCRIPTEN_BINDINGS(whisper) {

        // print system information
        {
-            printf("\n");
            printf("system_info: n_threads = %d / %d | %s\n",
                    params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());

-            printf("\n");
            printf("%s: processing %d samples, %.1f sec, %d threads, %d processors, lang = %s, task = %s ...\n",
                    __func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
                    params.n_threads, 1,
@ -81,11 +94,13 @@ EMSCRIPTEN_BINDINGS(whisper) {
            printf("\n");
        }

-        // run whisper
+        // run the worker
        {
-            whisper_reset_timings(g_context);
-            whisper_full(g_context, params, pcmf32.data(), pcmf32.size());
-            whisper_print_timings(g_context);
+            g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32)]() {
+                whisper_reset_timings(g_contexts[index]);
+                whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
+                whisper_print_timings(g_contexts[index]);
+            });
        }

        return 0;
--- a/bindings/javascript/libwhisper.worker.js
+++ b/bindings/javascript/libwhisper.worker.js
@ -1 +0,0 @@
-"use strict";var Module={};var ENVIRONMENT_IS_NODE=typeof process=="object"&&typeof process.versions=="object"&&typeof process.versions.node=="string";if(ENVIRONMENT_IS_NODE){var nodeWorkerThreads=require("worker_threads");var parentPort=nodeWorkerThreads.parentPort;parentPort.on("message",data=>onmessage({data:data}));var fs=require("fs");Object.assign(global,{self:global,require:require,Module:Module,location:{href:__filename},Worker:nodeWorkerThreads.Worker,importScripts:function(f){(0,eval)(fs.readFileSync(f,"utf8")+"//# sourceURL="+f)},postMessage:function(msg){parentPort.postMessage(msg)},performance:global.performance||{now:function(){return Date.now()}}})}var initializedJS=false;var pendingNotifiedProxyingQueues=[];function threadPrintErr(){var text=Array.prototype.slice.call(arguments).join(" ");if(ENVIRONMENT_IS_NODE){fs.writeSync(2,text+"\n");return}console.error(text)}function threadAlert(){var text=Array.prototype.slice.call(arguments).join(" ");postMessage({cmd:"alert",text:text,threadId:Module["_pthread_self"]()})}var err=threadPrintErr;self.alert=threadAlert;Module["instantiateWasm"]=(info,receiveInstance)=>{var instance=new WebAssembly.Instance(Module["wasmModule"],info);receiveInstance(instance);Module["wasmModule"]=null;return instance.exports};self.onunhandledrejection=e=>{throw e.reason??e};self.onmessage=e=>{try{if(e.data.cmd==="load"){Module["wasmModule"]=e.data.wasmModule;for(const handler of e.data.handlers){Module[handler]=function(){postMessage({cmd:"callHandler",handler:handler,args:[...arguments]})}}Module["wasmMemory"]=e.data.wasmMemory;Module["buffer"]=Module["wasmMemory"].buffer;Module["ENVIRONMENT_IS_PTHREAD"]=true;if(typeof e.data.urlOrBlob=="string"){importScripts(e.data.urlOrBlob)}else{var objectUrl=URL.createObjectURL(e.data.urlOrBlob);importScripts(objectUrl);URL.revokeObjectURL(objectUrl)}whisper_factory(Module).then(function(instance){Module=instance})}else 
if(e.data.cmd==="run"){Module["__performance_now_clock_drift"]=performance.now()-e.data.time;Module["__emscripten_thread_init"](e.data.pthread_ptr,0,0,1);Module["establishStackSpace"]();Module["PThread"].receiveObjectTransfer(e.data);Module["PThread"].threadInitTLS();if(!initializedJS){Module["__embind_initialize_bindings"]();pendingNotifiedProxyingQueues.forEach(queue=>{Module["executeNotifiedProxyingQueue"](queue)});pendingNotifiedProxyingQueues=[];initializedJS=true}try{Module["invokeEntryPoint"](e.data.start_routine,e.data.arg)}catch(ex){if(ex!="unwind"){if(ex instanceof Module["ExitStatus"]){if(Module["keepRuntimeAlive"]()){}else{Module["__emscripten_thread_exit"](ex.status)}}else{throw ex}}}}else if(e.data.cmd==="cancel"){if(Module["_pthread_self"]()){Module["__emscripten_thread_exit"](-1)}}else if(e.data.target==="setimmediate"){}else if(e.data.cmd==="processProxyingQueue"){if(initializedJS){Module["executeNotifiedProxyingQueue"](e.data.queue)}else{pendingNotifiedProxyingQueues.push(e.data.queue)}}else if(e.data.cmd){err("worker.js received unknown command "+e.data.cmd);err(e.data)}}catch(ex){if(Module["__emscripten_thread_crashed"]){Module["__emscripten_thread_crashed"]()}throw ex}};
--- a/bindings/javascript/package-tmpl.json
+++ b/bindings/javascript/package-tmpl.json
@ -1,26 +0,0 @@
-{
-  "name": "whisper.cpp",
-  "version": "@PROJECT_VERSION@",
-  "description": "Whisper speech recognition",
-  "main": "whisper.js",
-  "scripts": {
-    "test": "echo \"todo: add tests\" && exit 0"
-  },
-  "repository": {
-    "type": "git",
-    "url": "git+https://github.com/ggerganov/whisper.cpp"
-  },
-  "keywords": [
-    "openai",
-    "whisper",
-    "speech-to-text",
-    "speech-recognition",
-    "transformer"
-  ],
-  "author": "Georgi Gerganov",
-  "license": "MIT",
-  "bugs": {
-    "url": "https://github.com/ggerganov/whisper.cpp/issues"
-  },
-  "homepage": "https://github.com/ggerganov/whisper.cpp#readme"
-}
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,26 +0,0 @@
-{
-  "name": "whisper.cpp",
-  "version": "1.0.3",
-  "description": "Whisper speech recognition",
-  "main": "whisper.js",
-  "scripts": {
-    "test": "echo \"todo: add tests\" && exit 0"
-  },
-  "repository": {
-    "type": "git",
-    "url": "git+https://github.com/ggerganov/whisper.cpp"
-  },
-  "keywords": [
-    "openai",
-    "whisper",
-    "speech-to-text",
-    "speech-recognition",
-    "transformer"
-  ],
-  "author": "Georgi Gerganov",
-  "license": "MIT",
-  "bugs": {
-    "url": "https://github.com/ggerganov/whisper.cpp/issues"
-  },
-  "homepage": "https://github.com/ggerganov/whisper.cpp#readme"
-}
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -23,11 +23,9 @@ if (EMSCRIPTEN)
    add_subdirectory(stream.wasm)
    add_subdirectory(command.wasm)
    add_subdirectory(talk.wasm)
-    add_subdirectory(bench.wasm)
 else()
    add_subdirectory(main)
    add_subdirectory(stream)
    add_subdirectory(command)
    add_subdirectory(bench)
-    add_subdirectory(talk)
 endif()
--- a/examples/bench.wasm/CMakeLists.txt
+++ b/examples/bench.wasm/CMakeLists.txt
@ -1,47 +0,0 @@
-#
-# libbench
-#
-
-set(TARGET libbench)
-
-add_executable(${TARGET}
-    emscripten.cpp
-    )
-
-target_link_libraries(${TARGET} PRIVATE
-    whisper
-    )
-
-unset(EXTRA_FLAGS)
-
-if (WHISPER_WASM_SINGLE_FILE)
-    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
-    message(STATUS "Embedding WASM inside bench.js")
-
-    add_custom_command(
-        TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy
-        ${CMAKE_BINARY_DIR}/bin/libbench.js
-        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/bench.wasm/bench.js
-        )
-endif()
-
-set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
-    --bind \
-    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1024MB \
-    -s TOTAL_MEMORY=1024MB \
-    -s FORCE_FILESYSTEM=1 \
-    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
-    ${EXTRA_FLAGS} \
-    ")
-
-#
-# bench.wasm
-#
-
-set(TARGET bench.wasm)
-
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
--- a/examples/bench.wasm/README.md
+++ b/examples/bench.wasm/README.md
@ -1,22 +0,0 @@
-# bench.wasm
-
-Benchmark the performance of whisper.cpp in the browser using WebAssembly
-
-Link: https://whisper.ggerganov.com/bench/
-
-Terminal version: [examples/bench](/examples/bench)
-
-## Build instructions
-
-```bash
-# build using Emscripten (v3.1.2)
-git clone https://github.com/ggerganov/whisper.cpp
-cd whisper.cpp
-mkdir build-em && cd build-em
-emcmake cmake ..
-make -j
-
-# copy the produced page to your HTTP path
-cp bin/bench.wasm/*       /path/to/html/
-cp bin/libbench.worker.js /path/to/html/
-```
--- a/examples/bench.wasm/emscripten.cpp
+++ b/examples/bench.wasm/emscripten.cpp
@ -1,80 +0,0 @@
-#include "whisper.h"
-
-#include <emscripten.h>
-#include <emscripten/bind.h>
-
-#include <cmath>
-#include <string>
-#include <thread>
-#include <vector>
-
-constexpr int N_THREAD = 8;
-
-// TODO: get rid of this vector of contexts - bad idea in the first place
-std::vector<struct whisper_context *> g_contexts(4, nullptr);
-
-std::thread g_worker;
-
-void bench_main(size_t index) {
-    const int n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
-
-    // whisper context
-    auto & ctx = g_contexts[index];
-
-    fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
-
-    if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
-        fprintf(stderr, "error: failed to set mel: %d\n", ret);
-        return;
-    }
-
-    if (int ret = whisper_encode(ctx, 0, n_threads) != 0) {
-        fprintf(stderr, "error: failed to encode model: %d\n", ret);
-        return;
-    }
-
-    whisper_print_timings(ctx);
-
-    fprintf(stderr, "\n");
-    fprintf(stderr, "If you wish, you can submit these results here:\n");
-    fprintf(stderr, "\n");
-    fprintf(stderr, "  https://github.com/ggerganov/whisper.cpp/issues/89\n");
-    fprintf(stderr, "\n");
-    fprintf(stderr, "Please include the following information:\n");
-    fprintf(stderr, "\n");
-    fprintf(stderr, "  - CPU model\n");
-    fprintf(stderr, "  - Operating system\n");
-    fprintf(stderr, "  - Browser\n");
-    fprintf(stderr, "\n");
-}
-
-EMSCRIPTEN_BINDINGS(bench) {
-    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
-        for (size_t i = 0; i < g_contexts.size(); ++i) {
-            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init(path_model.c_str());
-                if (g_contexts[i] != nullptr) {
-                    if (g_worker.joinable()) {
-                        g_worker.join();
-                    }
-                    g_worker = std::thread([i]() {
-                        bench_main(i);
-                    });
-
-                    return i + 1;
-                } else {
-                    return (size_t) 0;
-                }
-            }
-        }
-
-        return (size_t) 0;
-    }));
-
-    emscripten::function("free", emscripten::optional_override([](size_t index) {
-        if (index < g_contexts.size()) {
-            whisper_free(g_contexts[index]);
-            g_contexts[index] = nullptr;
-        }
-    }));
-}
--- a/examples/bench.wasm/index-tmpl.html
+++ b/examples/bench.wasm/index-tmpl.html
@ -1,227 +0,0 @@
-<!doctype html>
-<html lang="en-us">
-    <head>
-        <title>bench : Benchmark whisper.cpp performance in the browser</title>
-
-        <style>
-            #output {
-                width: 100%;
-                height: 100%;
-                margin: 0 auto;
-                margin-top: 10px;
-                border-left: 0px;
-                border-right: 0px;
-                padding-left: 0px;
-                padding-right: 0px;
-                display: block;
-                background-color: black;
-                color: white;
-                font-size: 10px;
-                font-family: 'Lucida Console', Monaco, monospace;
-                outline: none;
-                white-space: pre;
-                overflow-wrap: normal;
-                overflow-x: scroll;
-            }
-        </style>
-    </head>
-    <body>
-        <div id="main-container">
-            <b>bench : Benchmark whisper.cpp performance in the browser</b>
-
-            <br><br>
-
-            You can find more about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/bench.wasm">GitHub</a>.
-
-            <br><br>
-
-            <hr>
-
-            Select the model you would like to use and click the "Bench" button.<br>
-            The results will be displayed in the textarea below.
-
-            <br><br>
-
-            <div id="model-whisper">
-                Whisper model: <span id="model-whisper-status"></span>
-                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
-                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
-                <span id="fetch-whisper-progress"></span>
-
-                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
-            </div>
-
-            <br>
-
-            <div id="input">
-                <button id="bench" onclick="onBench()" disabled>Bench</button>
-                <button id="clear" onclick="clearCache()">Clear Cache</button>
-            </div>
-
-            <hr>
-
-            Debug output:
-            <textarea id="output" rows="20"></textarea>
-
-            <br>
-
-            <b>Troubleshooting</b>
-
-            <br><br>
-
-            The page does some heavy computations, so make sure:
-
-            <ul>
-                <li>To use a modern web browser (e.g. Chrome, Firefox)</li>
-                <li>To use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
-                <li>Your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
-            </ul>
-
-            <div class="cell-version">
-                <span>
-                    |
-                    Build time: <span class="nav-link">@GIT_DATE@</span> |
-                    Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
-                    Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
-                    <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/bench.wasm">Source Code</a> |
-                </span>
-            </div>
-        </div>
-
-        <script type="text/javascript" src="helpers.js"></script>
-        <script type='text/javascript'>
-            // the bench instance
-            var instance = null;
-
-            // model name
-            var model_whisper = null;
-
-            var Module = {
-                print: printTextarea,
-                printErr: printTextarea,
-                setStatus: function(text) {
-                    printTextarea('js: ' + text);
-                },
-                monitorRunDependencies: function(left) {
-                },
-                preRun: function() {
-                    printTextarea('js: Preparing ...');
-                },
-                postRun: function() {
-                    printTextarea('js: Initialized successfully!');
-                }
-            };
-
-            //
-            // fetch models
-            //
-
-            let dbVersion = 1
-            let dbName    = 'whisper.ggerganov.com';
-            let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB
-
-            function storeFS(fname, buf) {
-                // write to WASM file using FS_createDataFile
-                // if the file exists, delete it
-                try {
-                    Module.FS_unlink(fname);
-                } catch (e) {
-                    // ignore
-                }
-
-                Module.FS_createDataFile("/", fname, buf, true, true);
-
-                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
-
-                model_whisper = fname;
-
-                document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
-
-                if (model_whisper != null) {
-                    document.getElementById('bench').disabled = false;
-                }
-            }
-
-            function loadFile(event, fname) {
-                var file = event.target.files[0] || null;
-                if (file == null) {
-                    return;
-                }
-
-                printTextarea("loadFile: loading model: " + file.name + ", size: " + file.size + " bytes");
-                printTextarea('loadFile: please wait ...');
-
-                var reader = new FileReader();
-                reader.onload = function(event) {
-                    var buf = new Uint8Array(reader.result);
-                    storeFS(fname, buf);
-                }
-                reader.readAsArrayBuffer(file);
-
-                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
-                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-                document.getElementById('whisper-file'         ).style.display = 'none';
-                document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
-            }
-
-            function loadWhisper(model) {
-                let urls = {
-                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
-                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
-                };
-
-                let sizes = {
-                    'tiny.en': 75,
-                    'base.en': 142,
-                };
-
-                let url     = urls[model];
-                let dst     = 'whisper.bin';
-                let size_mb = sizes[model];
-
-                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
-                document.getElementById('fetch-whisper-base-en').style.display = 'none';
-                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
-
-                cbProgress = function(p) {
-                    let el = document.getElementById('fetch-whisper-progress');
-                    el.innerHTML = Math.round(100*p) + '%';
-                };
-
-                cbCancel = function() {
-                    var el;
-                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
-                };
-
-                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
-            }
-
-            //
-            // main
-            //
-
-            function onBench() {
-                if (instance) {
-                    Module.free(instance);
-                }
-
-                instance = Module.init('whisper.bin');
-
-                if (instance) {
-                    printTextarea("js: whisper initialized, instance: " + instance);
-                }
-
-                document.getElementById('bench').disabled = true;
-
-                if (!instance) {
-                    printTextarea("js: failed to initialize whisper");
-                    return;
-                }
-            }
-
-        </script>
-        <script type="text/javascript" src="bench.js"></script>
-    </body>
-</html>
--- a/examples/bench/README.md
+++ b/examples/bench/README.md
@ -1,8 +1,6 @@
 # bench

-A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of
-the transformer on some random audio data and records the execution time. This way we can have an objective comparison
-of the performance of the model for various setups.
+A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of the transformer on some random audio data and records the execution time. This way we can have an objective comparison of the performance of the model for various setups.

 Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89

--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -34,6 +34,7 @@ struct whisper_params {

    bool speed_up      = false;
    bool translate     = false;
+    bool no_context    = true;
    bool print_special = false;
    bool print_energy  = false;
    bool no_timestamps = true;
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@ -1,33 +1,15 @@
 #!/bin/bash
-#
+set -eo pipefail
 # Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals
 # Idea by @semiformal-net
 # ref: https://github.com/ggerganov/whisper.cpp/issues/185
 #

-set -eo pipefail
-
 url="http://a.files.bbci.co.uk/media/live/manifesto/audio/simulcast/hls/nonuk/sbr_low/ak/bbc_world_service.m3u8"
 fmt=aac # the audio format extension of the stream (TODO: auto detect)
 step_s=30
 model="base.en"

-check_requirements()
-{
-    if ! command -v ./main &>/dev/null; then
-        echo "whisper.cpp main executable is required (make)"
-        exit 1
-    fi
-
-    if ! command -v ffmpeg &>/dev/null; then
-        echo "ffmpeg is required (https://ffmpeg.org)"
-        exit 1
-    fi
-}
-
-check_requirements
-
-
 if [ -z "$1" ]; then
    echo "Usage: $0 stream_url [step_s] [model]"
    echo ""
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -266,7 +266,7 @@ bool output_txt(struct whisper_context * ctx, const char * fname) {
    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
-        fout << text << "\n";
+        fout << text;
    }

    return true;
--- a/examples/talk.wasm/README.md
+++ b/examples/talk.wasm/README.md
@ -6,8 +6,6 @@ Talk with an Artificial Intelligence in your browser:

 Online demo: https://whisper.ggerganov.com/talk/

-Terminal version: [examples/talk](/examples/talk)
-
 ## How it works?

 This demo leverages 2 modern neural network models to create a high-quality voice chat directly in your browser:
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -325,9 +325,10 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = NULL;
+        struct ggml_init_params params = {
+            .mem_size   = ctx_size,
+            .mem_buffer = NULL,
+        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -528,14 +529,13 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
+    struct ggml_init_params params = {
+        .mem_size   = buf_size,
+        .mem_buffer = buf,
+    };

    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph gf = { };
-    gf.n_threads = n_threads;
+    struct ggml_cgraph gf = { .n_threads = n_threads };

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
--- a/examples/talk/.gitignore
+++ b/examples/talk/.gitignore
@ -1 +0,0 @@
-eleven-labs.py
--- a/examples/talk/CMakeLists.txt
+++ b/examples/talk/CMakeLists.txt
@ -1,13 +0,0 @@
-if (WHISPER_SUPPORT_SDL2)
-    # talk
-    set(TARGET talk)
-    #add_executable(${TARGET} talk.cpp gpt-2.cpp)
-    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
-    #target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
-    # TODO: this is temporary
-    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk.cpp gpt-2.cpp ../../ggml.c ../../whisper.cpp)
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-endif ()
--- a/examples/talk/README.md
+++ b/examples/talk/README.md
@ -1,41 +0,0 @@
-# talk
-
-Talk with an Artificial Intelligence in your terminal
-
-[Demo Talk](https://user-images.githubusercontent.com/1991296/206805012-48e71cc2-588d-4745-8798-c1c70ea3b40d.mp4)
-
-Web version: [examples/talk.wasm](/examples/talk.wasm)
-
-## Building
-
-The `talk` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
-
-```bash
-# Install SDL2 on Linux
-sudo apt-get install libsdl2-dev
-
-# Install SDL2 on Mac OS
-brew install sdl2
-
-# Build the "talk" executable
-make talk
-
-# Run it
-./talk -p Santa
-```
-
-## GPT-2
-
-To run this, you will need a ggml GPT-2 model: [instructions](https://github.com/ggerganov/ggml/tree/master/examples/gpt-2#downloading-and-converting-the-original-models)
-
-Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:
-
-```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://ggml.ggerganov.com/ggml-model-gpt-2-117M.bin
-```
-
-## TTS
-
-For best experience, this example needs a TTS tool to convert the generated text responses to voice.
-You can use any TTS engine that you would like - simply edit the [speak.sh](speak.sh) script to your needs.
-By default, it is configured to use `espeak`, but you can use whatever you wish.
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -1,925 +0,0 @@
-#include "ggml.h"
-#include "gpt-2.h"
-
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-#include <random>
-
-/////////////////////// GPT-2 BEGIN /////////////////////////
-
-//
-// Vocab utils
-//
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.size() == 0) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const gpt_vocab & vocab,
-        const float * logits,
-        int    top_k,
-        double top_p,
-        double temp,
-        std::mt19937 & rng) {
-    int n_logits = vocab.id_to_token.size();
-
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    for (int i = 0; i < n_logits; i++) {
-        logits_id.push_back(std::make_pair(logits[i], i));
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    // normalize
-    {
-        double sum = 0.0f;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            sum += logits_id[i].first;
-        }
-
-        sum = 1.0/sum;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            logits_id[i].first *= sum;
-        }
-    }
-
-    if (top_p < 1.0f) {
-        {
-            double cumsum = 0.0f;
-            for (int i = 0; i < top_k; i++) {
-                cumsum += logits_id[i].first;
-                if (cumsum >= top_p) {
-                    logits_id.resize(i+1);
-                    break;
-                }
-            }
-        }
-
-        // normalize again
-        {
-            double sum = 0.0f;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                sum += logits_id[i].first;
-            }
-
-            sum = 1.0/sum;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                logits_id[i].first *= sum;
-            }
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int)logits_id.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
-    //}
-    //exit(0);
-
-    // sample from the obtained distribution
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    for (int i = 0; i < (int) logits_id.size(); i++) {
-        probs.push_back(logits_id[i].first);
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-}
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
-    int32_t n_vocab = 50257;
-    int32_t n_ctx   = 1024;
-    int32_t n_embd  = 768;
-    int32_t n_head  = 12;
-    int32_t n_layer = 12;
-    int32_t f16     = 1;
-};
-
-struct gpt2_layer {
-    // normalization
-    struct ggml_tensor * ln_1_g;
-    struct ggml_tensor * ln_1_b;
-
-    struct ggml_tensor * ln_2_g;
-    struct ggml_tensor * ln_2_b;
-
-    // attention
-    struct ggml_tensor * c_attn_attn_w;
-    struct ggml_tensor * c_attn_attn_b;
-
-    struct ggml_tensor * c_attn_proj_w;
-    struct ggml_tensor * c_attn_proj_b;
-
-    // mlp
-    struct ggml_tensor * c_mlp_fc_w;
-    struct ggml_tensor * c_mlp_fc_b;
-
-    struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
-    struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_model {
-    gpt2_hparams hparams;
-
-    // normalization
-    struct ggml_tensor * ln_f_g;
-    struct ggml_tensor * ln_f_b;
-
-    struct ggml_tensor * wte; // position embedding
-    struct ggml_tensor * wpe; //    token embedding
-
-    std::vector<gpt2_layer> layers;
-
-    // key + value memory
-    struct ggml_tensor * memory_k;
-    struct ggml_tensor * memory_v;
-
-    //
-    struct ggml_context * ctx;
-    std::map<std::string, struct ggml_tensor *> tensors;
-};
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
-    printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
-    auto fin = std::ifstream(fname, std::ios::binary);
-    if (!fin) {
-        fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
-        return false;
-    }
-
-    // verify magic
-    {
-        uint32_t magic;
-        fin.read((char *) &magic, sizeof(magic));
-        if (magic != 0x67676d6c) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
-            return false;
-        }
-    }
-
-    // load hparams
-    {
-        auto & hparams = model.hparams;
-
-        fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
-        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
-        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
-        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
-
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
-    }
-
-    // load vocab
-    {
-        int32_t n_vocab = 0;
-        fin.read((char *) &n_vocab, sizeof(n_vocab));
-
-        if (n_vocab != model.hparams.n_vocab) {
-            fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
-                    __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
-            return false;
-        }
-
-        std::string word;
-        for (int i = 0; i < n_vocab; i++) {
-            uint32_t len;
-            fin.read((char *) &len, sizeof(len));
-
-            word.resize(len);
-            fin.read((char *) word.data(), len);
-
-            vocab.token_to_id[word] = i;
-            vocab.id_to_token[i] = word;
-        }
-    }
-
-    // for the big tensors, we have the option to store the data in 16-bit floats
-    // in order to save memory and also to speed up the computation
-    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
-
-    auto & ctx = model.ctx;
-
-    size_t ctx_size = 0;
-
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
-
-        ctx_size += n_vocab*n_embd*ggml_type_size(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
-
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
-
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
-
-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
-
-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_size(GGML_TYPE_F32));   // c_attn_proj_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
-
-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
-
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
-
-        ctx_size += (6 + 12*n_layer)*256; // object overhead
-
-        printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
-    }
-
-    // create the ggml context
-    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = NULL;
-
-        model.ctx = ggml_init(params);
-        if (!model.ctx) {
-            fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-            return false;
-        }
-    }
-
-    // prepare memory for the weights
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-        const int n_vocab = hparams.n_vocab;
-
-        model.layers.resize(n_layer);
-
-        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
-        model.wte = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
-
-        // map by name
-        model.tensors["model/ln_f/g"] = model.ln_f_g;
-        model.tensors["model/ln_f/b"] = model.ln_f_b;
-
-        model.tensors["model/wte"] = model.wte;
-        model.tensors["model/wpe"] = model.wpe;
-
-        for (int i = 0; i < n_layer; ++i) {
-            auto & layer = model.layers[i];
-
-            layer.ln_1_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.ln_2_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_attn_attn_w      = ggml_new_tensor_2d(ctx, wtype,         3*n_embd, n_embd);
-            layer.c_attn_attn_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
-            layer.c_attn_proj_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            layer.c_mlp_fc_w         = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_fc_b         = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
-            layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-
-            // map by name
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_1/b"]        = layer.ln_1_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/g"]        = layer.ln_2_g;
-            model.tensors["model/h" + std::to_string(i) + "/ln_2/b"]        = layer.ln_2_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
-            model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;
-
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w_trans;
-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
-        }
-    }
-
-    // key + value memory
-    {
-        const auto & hparams = model.hparams;
-
-        const int n_embd  = hparams.n_embd;
-        const int n_layer = hparams.n_layer;
-        const int n_ctx   = hparams.n_ctx;
-
-        const int n_mem      = n_layer*n_ctx;
-        const int n_elements = n_embd*n_mem;
-
-        model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-        model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
-        const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
-        printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-    }
-
-    // load weights
-    {
-        size_t total_size = 0;
-
-        while (true) {
-            int32_t n_dims;
-            int32_t length;
-            int32_t ftype;
-
-            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
-            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
-
-            if (fin.eof()) {
-                break;
-            }
-
-            int32_t nelements = 1;
-            int32_t ne[2] = { 1, 1 };
-            for (int i = 0; i < n_dims; ++i) {
-                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
-                nelements *= ne[i];
-            }
-
-            std::string name(length, 0);
-            fin.read(&name[0], length);
-
-            if (model.tensors.find(name.data()) == model.tensors.end()) {
-                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
-                return false;
-            }
-
-            auto tensor = model.tensors[name.data()];
-            if (ggml_nelements(tensor) != nelements) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
-                return false;
-            }
-
-            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
-                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
-                return false;
-            }
-
-            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
-
-            if (nelements*bpe != ggml_nbytes(tensor)) {
-                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
-                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
-                return false;
-            }
-
-            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
-
-            //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
-            total_size += ggml_nbytes(tensor);
-        }
-
-        printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
-    }
-
-    fin.close();
-
-    return true;
-}
-
-// evaluate the transformer
-//
-//   - model:     the model
-//   - n_threads: number of threads to use
-//   - n_past:    the context size so far
-//   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted probabilities of the next token
-//
-bool gpt2_eval(
-        const gpt2_model & model,
-        const int n_threads,
-        const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float>         & embd_w,
-              size_t                     & mem_per_token) {
-    const int N = embd_inp.size();
-
-    const auto & hparams = model.hparams;
-
-    const int n_embd  = hparams.n_embd;
-    const int n_layer = hparams.n_layer;
-    const int n_ctx   = hparams.n_ctx;
-    const int n_head  = hparams.n_head;
-    const int n_vocab = hparams.n_vocab;
-
-    static size_t buf_size = 5640ull*1024*1024;
-    static void * buf = malloc(buf_size);
-
-    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
-        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
-
-        // reallocate
-        buf_size = buf_size_new;
-        buf = realloc(buf, buf_size);
-        if (buf == nullptr) {
-            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, buf_size);
-            return false;
-        }
-    }
-
-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
-
-    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph gf = { };
-    gf.n_threads = n_threads;
-
-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
-    for (int i = 0; i < N; ++i) {
-        ((int32_t *) position->data)[i] = n_past + i;
-    }
-
-    // wte + wpe
-    struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * cur;
-
-        // norm
-        {
-            // [ 768, N]
-            cur = ggml_norm(ctx0, inpL);
-
-            // cur = ln_1_g*cur + ln_1_b
-            // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
-                        cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
-        }
-
-        // attn
-        // [2304, 768] - model.layers[il].c_attn_attn_w
-        // [2304,   1] - model.layers[il].c_attn_attn_b
-        // [ 768,   N] - cur (in)
-        // [2304,   N] - cur (out)
-        //
-        // cur = attn_w*cur + attn_b
-        // [2304, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
-                    cur);
-        }
-
-        // self-attention
-        {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
-            // store key and value to memory
-            if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
-            }
-
-            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
-            // [64, N, 12]
-            struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
-                        0, 2, 1, 3);
-
-            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
-            // [64, n_past + N, 12]
-            struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        0, 2, 1, 3);
-
-            // GG: flash attention
-            //struct ggml_tensor * V =
-            //    ggml_cpy(ctx0,
-            //            ggml_permute(ctx0,
-            //                ggml_reshape_3d(ctx0,
-            //                    ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-            //                    n_embd/n_head, n_head, n_past + N),
-            //                1, 2, 0, 3),
-            //            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
-            //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
-            // K * Q
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
-
-            // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
-                        KQ,
-                        ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
-                        );
-
-            // KQ_masked = mask_past(KQ_scaled)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
-
-            // KQ = soft_max(KQ_masked)
-            // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
-
-            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
-            // [n_past + N, 64, 12]
-            struct ggml_tensor * V_trans =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
-
-            // KQV = transpose(V) * KQ_soft_max
-            // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
-            // KQV_merged = KQV.permute(0, 2, 1, 3)
-            // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
-
-            // cur = KQV_merged.contiguous().view(n_embd, N)
-            // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
-        }
-
-        // projection
-        // [ 768, 768] - model.layers[il].c_attn_proj_w
-        // [ 768,   1] - model.layers[il].c_attn_proj_b
-        // [ 768,   N] - cur (in)
-        // [ 768,   N] - cur (out)
-        //
-        // cur = proj_w*cur + proj_b
-        // [768, N]
-        {
-            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
-                    cur);
-        }
-
-        // add the input
-        cur = ggml_add(ctx0, cur, inpL);
-
-        struct ggml_tensor * inpFF = cur;
-
-        // feed-forward network
-        {
-            // norm
-            {
-                cur = ggml_norm(ctx0, inpFF);
-
-                // cur = ln_2_g*cur + ln_2_b
-                // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
-                            cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
-            }
-
-            // fully connected
-            // [3072, 768] - model.layers[il].c_mlp_fc_w
-            // [3072,   1] - model.layers[il].c_mlp_fc_b
-            // [ 768,   N] - cur (in)
-            // [3072,   N] - cur (out)
-            //
-            // cur = fc_w*cur + fc_b
-            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
-                    cur);
-
-            // GELU activation
-            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
-
-            // projection
-            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
-            // [ 768,    1] - model.layers[il].c_mlp_proj_b
-            // [3072,    N] - cur (in)
-            // [ 768,    N] - cur (out)
-            //
-            // cur = proj_w*cur + proj_b
-            // [768, N]
-            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w_trans,
-                    cur);
-
-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
-                    cur);
-        }
-
-        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
-    }
-
-    // norm
-    {
-        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL);
-
-        // inpL = ln_f_g*inpL + ln_f_b
-        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
-                    inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
-    }
-
-    // inpL = WTE * inpL
-    // [ 768, 50257] - model.wte
-    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.wte, inpL);
-
-    // logits -> probs
-    inpL = ggml_soft_max(ctx0, inpL);
-
-    // run the computation
-    ggml_build_forward_expand(&gf, inpL);
-    ggml_graph_compute       (ctx0, &gf);
-
-    //if (n_past%100 == 0) {
-    //    ggml_graph_print   (&gf);
-    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
-    //}
-
-    //embd_w.resize(n_vocab*N);
-    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
-
-    // return result for just the last token
-    embd_w.resize(n_vocab);
-    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
-
-    if (mem_per_token == 0) {
-        mem_per_token = ggml_used_mem(ctx0)/N;
-    }
-    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
-
-    ggml_free(ctx0);
-
-    return true;
-}
-
-/////////////////////////////// GPT-2 END ////////////////////////////////
-
-// upper bound for the default number of threads stored in gpt2_context::n_threads
-constexpr int N_THREAD = 8;
-
-// State of one GPT-2 session: base prompt, random generator, loaded model + vocabulary
-// and the sampling settings used by gpt2_gen_text().
-struct gpt2_context {
-    // default base prompt; can be replaced through gpt2_set_prompt()
-    std::string prompt_base = R"(Hello, how are you?
-I'm fine, thanks. How are you?
-Thanks, I'm fine too. What are you doing?
-I'm just sitting here.
-It's a lovely day, isn't it?
-Yes, it is. I love the weather this time of year.
-I wish it would rain a little bit.
-Me too.
-)";
-
-    // random generator handed to the token sampler (seeded in gpt2_init())
-    std::mt19937 rng;
-
-    gpt_vocab vocab;
-    gpt2_model model;
-
-    // number of threads passed to gpt2_eval(): at most N_THREAD, at most the hardware concurrency
-    int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
-
-    // sampling parameters
-    int32_t top_k = 20;
-    float   top_p = 0.98f;
-    float   temp  = 1.0f;
-};
-
-// Create a GPT-2 context and load the model and vocabulary from path_model.
-//
-// Returns the new context (to be released with gpt2_free()), or nullptr when the
-// model could not be loaded.
-struct gpt2_context * gpt2_init(const char * path_model) {
-    gpt2_context * ctx = new gpt2_context;
-
-    // seed the sampler from the wall clock
-    ctx->rng = std::mt19937(time(NULL));
-
-    // load the model
-    {
-        const int64_t t_start_us = ggml_time_us();
-
-        if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
-            // report the path that was actually requested
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
-
-            // the caller only receives nullptr, so the context has to be released here
-            delete ctx;
-            return nullptr;
-        }
-
-        const int64_t t_load_us = ggml_time_us() - t_start_us;
-
-        printf("gpt-2: model loaded in %d ms\n", (int) (t_load_us/1000));
-    }
-
-    return ctx;
-}
-
-// Destroy a context created by gpt2_init(). Passing nullptr is harmless (plain delete).
-void gpt2_free(struct gpt2_context * ctx) {
-    delete ctx;
-}
-
-// Return the current base prompt. The pointer refers to storage owned by ctx and is
-// only valid until the prompt is changed or the context is freed.
-const char * gpt2_get_prompt(struct gpt2_context * ctx) {
-    return ctx->prompt_base.c_str();
-}
-
-// Replace the base prompt with a copy of prompt (must be a valid NUL-terminated string).
-void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt) {
-    ctx->prompt_base = prompt;
-}
-
-// Convert text to token ids using the vocabulary stored in ctx (forwards to gpt_tokenize()).
-std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text) {
-    return ::gpt_tokenize(ctx->vocab, text);
-}
-
-// Generate a continuation of text.
-//
-// At most max_tokens tokens are produced, and never more than fit into the model
-// context (n_ctx) after the prompt. Generation stops early at the end-of-text token
-// or at a token that is exactly ".", "!" or "?" (the stopping token is still appended
-// to the result). Returns the generated text, or "" when evaluation fails.
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens) {
-    int n_past = 0;
-
-    std::vector<float> embd_w;
-
-    // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::gpt2_tokenize(ctx, text);
-
-    int n_predict = std::min(max_tokens, ctx->model.hparams.n_ctx - (int) embd_inp.size());
-
-    // the first evaluation processes the whole prompt; afterwards embd holds only the last sampled token
-    std::vector<gpt_vocab::id> embd = embd_inp;
-
-    // NOTE(review): presumably a per-token memory estimate consumed by gpt2_eval() - confirm
-    size_t mem_per_token = 3000000;
-
-    std::string result;
-
-    for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
-        // predict
-        if (embd.size() > 0) {
-            if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
-                printf("gpt-2: failed to generate text\n");
-                return "";
-            }
-        }
-
-        n_past += embd.size();
-        embd.clear();
-
-        {
-            // sample next token
-            const int   top_k = ctx->top_k;
-            const float top_p = ctx->top_p;
-            const float temp  = ctx->temp;
-
-            const int n_vocab = ctx->model.hparams.n_vocab;
-
-            // sample from the last n_vocab values of embd_w
-            const gpt_vocab::id id = gpt_sample_top_k_top_p(ctx->vocab, embd_w.data() + (embd_w.size() - n_vocab), top_k, top_p, temp, ctx->rng);
-
-            // add it to the context
-            embd.push_back(id);
-        }
-
-        result += ctx->vocab.id_to_token[embd[0]];
-
-        // end of text token
-        if (embd.back() == 50256 ||
-            ctx->vocab.id_to_token[embd.back()] == "." ||
-            ctx->vocab.id_to_token[embd.back()] == "!" ||
-            ctx->vocab.id_to_token[embd.back()] == "?") {
-            break;
-        }
-    }
-
-    return result;
-}
--- a/examples/talk/gpt-2.h
+++ b/examples/talk/gpt-2.h
@ -1,27 +0,0 @@
-#pragma once
-
-// TODO: Change to C-style API and move to ./examples for easy reuse.
-
-#include <vector>
-#include <map>
-#include <string>
-
-// Vocabulary: bidirectional mapping between token strings and integer token ids.
-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id; // token string -> id
-    std::map<id, token> id_to_token; // id -> token string
-};
-
-// opaque session state, defined in gpt-2.cpp
-struct gpt2_context;
-
-// create a context from a model file (nullptr on failure) / destroy it
-struct gpt2_context * gpt2_init(const char * path_model);
-void gpt2_free(struct gpt2_context * ctx);
-
-// read / replace the base prompt stored in the context
-const char * gpt2_get_prompt(struct gpt2_context * ctx);
-void gpt2_set_prompt(struct gpt2_context * ctx, const char * prompt);
-
-// convert text to token ids with the context's vocabulary
-std::vector<gpt_vocab::id> gpt2_tokenize(const gpt2_context * ctx, const char * text);
-
-// generate up to max_tokens tokens continuing text; returns the generated text
-std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens);
--- a/examples/talk/speak.sh
+++ b/examples/talk/speak.sh
@ -1,17 +0,0 @@
-#!/bin/bash
-
-# Speak a text with a text-to-speech engine.
-#
-# Usage:
-#  speak.sh <voice_id> <text-to-speak>
-#
-#  voice_id      - number appended to the espeak voice name (en-us+m<voice_id>)
-#  text-to-speak - the text, passed as a single argument
-
-# espeak
-# Mac OS: brew install espeak
-# Linux: apt-get install espeak
-#
-# the voice argument is quoted so that an empty or unusual voice id cannot be word-split
-espeak -v "en-us+m$1" -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
-
-# Eleven Labs
-#
-#wd=$(dirname $0)
-#script=$wd/eleven-labs.py
-#python3 $script $1 "$2"
-#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -1,733 +0,0 @@
-// Talk with AI
-//
-
-#include "whisper.h"
-#include "gpt-2.h"
-
-#include <SDL.h>
-#include <SDL_audio.h>
-
-#include <cassert>
-#include <cstdio>
-#include <fstream>
-#include <mutex>
-#include <regex>
-#include <string>
-#include <thread>
-#include <vector>
-#include <regex>
-
-// command-line parameters
-// (the member initializers are the defaults shown by whisper_print_usage())
-struct whisper_params {
-    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
-    int32_t voice_ms   = 10000; // amount of captured audio (ms) that is transcribed
-    int32_t capture_id = -1;    // capture device index, negative -> default device
-    int32_t max_tokens = 32;    // limit for both the transcription and the generated reply
-    int32_t audio_ctx  = 0;
-
-    float vad_thold    = 0.6f;
-    float freq_thold   = 100.0f;
-
-    bool speed_up      = false;
-    bool translate     = false;
-    bool print_special = false;
-    bool print_energy  = false;
-    bool no_timestamps = true;  // no command-line switch changes this value
-
-    std::string person    = "Santa";
-    std::string language  = "en";
-    std::string model_wsp = "models/ggml-base.en.bin";
-    std::string model_gpt = "models/ggml-gpt-2-117M.bin";
-    std::string speak     = "./examples/talk/speak.sh";
-    std::string fname_out = "";
-};
-
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
-
-// Parse the command line into params.
-//
-// Prints the usage and exits on -h/--help and on an unknown argument.
-// Returns false (after printing an error and the usage) when an option that needs a
-// value is the last argument; returns true otherwise.
-bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
-    // options that consume the following argument as their value
-    static const char * const k_value_opts[] = {
-        "-t",   "--threads",
-        "-vms", "--voice-ms",
-        "-c",   "--capture",
-        "-mt",  "--max-tokens",
-        "-ac",  "--audio-ctx",
-        "-vth", "--vad-thold",
-        "-fth", "--freq-thold",
-        "-p",   "--person",
-        "-l",   "--language",
-        "-mw",  "--model-whisper",
-        "-mg",  "--model-gpt",
-        "-s",   "--speak",
-        "-f",   "--file",
-    };
-
-    for (int i = 1; i < argc; i++) {
-        std::string arg = argv[i];
-
-        if (arg == "-h" || arg == "--help") {
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-
-        // argv[argc] is a null pointer: never hand it to std::stoi / std::string
-        bool takes_value = false;
-        for (const char * opt : k_value_opts) {
-            if (arg == opt) {
-                takes_value = true;
-                break;
-            }
-        }
-
-        if (takes_value && i + 1 >= argc) {
-            fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            return false;
-        }
-
-        if      (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
-        else if (arg == "-vms" || arg == "--voice-ms")      { params.voice_ms      = std::stoi(argv[++i]); }
-        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
-        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
-        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
-        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
-        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
-        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
-        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
-        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
-        else if (arg == "-p"   || arg == "--person")        { params.person        = argv[++i]; }
-        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
-        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp     = argv[++i]; }
-        else if (arg == "-mg"  || arg == "--model-gpt")     { params.model_gpt     = argv[++i]; }
-        else if (arg == "-s"   || arg == "--speak")         { params.speak         = argv[++i]; }
-        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
-        else {
-            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
-            whisper_print_usage(argc, argv, params);
-            exit(0);
-        }
-    }
-
-    return true;
-}
-
-// Print the command-line help to stderr. The value shown in brackets for each option
-// is the current content of params. argc is not used.
-void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
-    fprintf(stderr, "\n");
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
-    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
-    fprintf(stderr, "  -vms N,   --voice-ms N    [%-7d] voice duration in milliseconds\n",              params.voice_ms);
-    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
-    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
-    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
-    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
-    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
-    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
-    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
-    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
-    fprintf(stderr, "  -p NAME,  --person NAME   [%-7s] person name (for prompt selection)\n",          params.person.c_str());
-    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
-    fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n",                          params.model_wsp.c_str());
-    fprintf(stderr, "  -mg FILE, --model-gpt     [%-7s] gpt model file\n",                              params.model_gpt.c_str());
-    fprintf(stderr, "  -s FILE,  --speak TEXT    [%-7s] command for TTS\n",                             params.speak.c_str());
-    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
-    fprintf(stderr, "\n");
-}
-
-//
-// SDL Audio capture
-//
-
-// Captures audio through an SDL capture device into a circular buffer of float samples.
-// The SDL callback writes into the buffer and get() reads from it; both hold m_mutex
-// while touching the buffer.
-class audio_async {
-public:
-    audio_async(int len_ms);
-    ~audio_async();
-
-    bool init(int capture_id, int sample_rate);
-
-    // start capturing audio via the provided SDL callback
-    // keep last len_ms milliseconds of audio in a circular buffer
-    bool resume();
-    bool pause();
-    bool clear();
-
-    // callback to be called by SDL
-    void callback(uint8_t * stream, int len);
-
-    // get audio data from the circular buffer
-    void get(int ms, std::vector<float> & audio);
-
-private:
-    SDL_AudioDeviceID m_dev_id_in = 0; // 0 -> no device opened
-
-    int m_len_ms = 0;      // buffer length in milliseconds
-    int m_sample_rate = 0; // sample rate obtained from SDL in init()
-
-    bool       m_running = false;
-    std::mutex m_mutex;
-
-    std::vector<float> m_audio;         // the circular buffer
-    std::vector<float> m_audio_new;     // copy of the most recent callback chunk
-    size_t             m_audio_pos = 0; // next write position in m_audio
-    size_t             m_audio_len = 0; // number of valid samples in m_audio
-};
-
-// Remember the requested buffer length (in milliseconds); the buffer itself is sized in init().
-audio_async::audio_async(int len_ms) : m_len_ms(len_ms) {
-}
-
-// Close the capture device, if one was opened by init().
-audio_async::~audio_async() {
-    if (m_dev_id_in) {
-        SDL_CloseAudioDevice(m_dev_id_in);
-    }
-}
-
-// Initialize SDL audio, open a capture device and size the circular buffer.
-//
-// capture_id  - index of the capture device to open; a negative value opens the default device
-// sample_rate - requested sample rate (the rate actually obtained is stored in m_sample_rate)
-//
-// Returns false when SDL cannot be initialized or no device can be opened.
-bool audio_async::init(int capture_id, int sample_rate) {
-    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
-
-    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
-        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
-        return false;
-    }
-
-    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
-
-    // list the available capture devices
-    {
-        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
-        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
-        for (int i = 0; i < nDevices; i++) {
-            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
-        }
-    }
-
-    SDL_AudioSpec capture_spec_requested;
-    SDL_AudioSpec capture_spec_obtained;
-
-    SDL_zero(capture_spec_requested);
-    SDL_zero(capture_spec_obtained);
-
-    // request mono 32-bit float samples
-    capture_spec_requested.freq     = sample_rate;
-    capture_spec_requested.format   = AUDIO_F32;
-    capture_spec_requested.channels = 1;
-    capture_spec_requested.samples  = 1024;
-    // forward the C callback to the member function of the object passed as userdata
-    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
-        audio_async * audio = (audio_async *) userdata;
-        audio->callback(stream, len);
-    };
-    capture_spec_requested.userdata = this;
-
-    if (capture_id >= 0) {
-        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
-        m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    } else {
-        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
-        m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
-    }
-
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
-        m_dev_id_in = 0;
-
-        return false;
-    } else {
-        fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
-        fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
-        fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
-                capture_spec_requested.format);
-        fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
-                capture_spec_requested.channels);
-        fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
-        fprintf(stderr, "\n");
-    }
-
-    m_sample_rate = capture_spec_obtained.freq;
-
-    // room for m_len_ms milliseconds of audio at the obtained sample rate
-    m_audio.resize((m_sample_rate*m_len_ms)/1000);
-
-    return true;
-}
-
-// Start capturing. Fails when no device is open or when capturing is already active.
-bool audio_async::resume() {
-    const bool have_device = m_dev_id_in != 0;
-
-    if (!have_device) {
-        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
-        return false;
-    }
-
-    if (m_running) {
-        fprintf(stderr, "%s: already running!\n", __func__);
-        return false;
-    }
-
-    // pause_on == 0 -> the device starts delivering audio to the callback
-    SDL_PauseAudioDevice(m_dev_id_in, 0);
-    m_running = true;
-
-    return true;
-}
-
-// Stop capturing. Fails when no device is open or when capturing is not active.
-bool audio_async::pause() {
-    const bool have_device = m_dev_id_in != 0;
-
-    if (!have_device) {
-        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
-        return false;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: already paused!\n", __func__);
-        return false;
-    }
-
-    // pause_on == 1 -> the device stops delivering audio to the callback
-    SDL_PauseAudioDevice(m_dev_id_in, 1);
-    m_running = false;
-
-    return true;
-}
-
-// Discard all buffered audio. Fails when no device is open or capturing is not active.
-bool audio_async::clear() {
-    const bool have_device = m_dev_id_in != 0;
-
-    if (!have_device) {
-        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
-        return false;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return false;
-    }
-
-    // the callback writes these fields concurrently, so reset them under the lock
-    const std::lock_guard<std::mutex> guard(m_mutex);
-
-    m_audio_len = 0;
-    m_audio_pos = 0;
-
-    return true;
-}
-
-// callback to be called by SDL
-//
-// Appends the float samples contained in the len bytes at stream to the circular
-// buffer, overwriting the oldest samples when the buffer is full.
-// Does nothing while capturing is not active.
-void audio_async::callback(uint8_t * stream, int len) {
-    if (!m_running) {
-        return;
-    }
-
-    const size_t n_new = len / sizeof(float);
-
-    // typed copy of the incoming bytes, so that all offsets below are in samples, not bytes
-    m_audio_new.resize(n_new);
-    memcpy(m_audio_new.data(), stream, n_new * sizeof(float));
-
-    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_new, m_audio_pos, m_audio_len);
-
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-
-        const size_t capacity = m_audio.size();
-        if (capacity == 0) {
-            // nothing to write into (and the modulo below would divide by zero)
-            return;
-        }
-
-        // a chunk larger than the whole buffer: only its newest part can be kept
-        const size_t  n_samples = std::min(n_new, capacity);
-        const float * src       = m_audio_new.data() + (n_new - n_samples);
-
-        if (m_audio_pos + n_samples > capacity) {
-            // wrap around: fill up to the end, then continue at the start
-            const size_t n0 = capacity - m_audio_pos;
-
-            memcpy(&m_audio[m_audio_pos], src, n0 * sizeof(float));
-            memcpy(&m_audio[0], src + n0, (n_samples - n0) * sizeof(float));
-
-            m_audio_pos = (m_audio_pos + n_samples) % capacity;
-            m_audio_len = capacity;
-        } else {
-            memcpy(&m_audio[m_audio_pos], src, n_samples * sizeof(float));
-
-            m_audio_pos = (m_audio_pos + n_samples) % capacity;
-            m_audio_len = std::min(m_audio_len + n_samples, capacity);
-        }
-    }
-}
-
-// Copy the most recent ms milliseconds of captured audio into result, oldest sample first.
-//
-// ms <= 0 requests the whole buffer length (m_len_ms). The amount is limited to what has
-// been captured so far. When no device is open or capturing is not active, a message is
-// printed and result is left untouched.
-void audio_async::get(int ms, std::vector<float> & result) {
-    if (!m_dev_id_in) {
-        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
-        return;
-    }
-
-    if (!m_running) {
-        fprintf(stderr, "%s: not running!\n", __func__);
-        return;
-    }
-
-    result.clear();
-
-    {
-        std::lock_guard<std::mutex> lock(m_mutex);
-
-        if (ms <= 0) {
-            ms = m_len_ms;
-        }
-
-        size_t n_samples = (m_sample_rate * ms) / 1000;
-        if (n_samples > m_audio_len) {
-            n_samples = m_audio_len;
-        }
-
-        result.resize(n_samples);
-
-        // start n_samples before the write position, wrapping around the start of the buffer
-        int s0 = m_audio_pos - n_samples;
-        if (s0 < 0) {
-            s0 += m_audio.size();
-        }
-
-        if (s0 + n_samples > m_audio.size()) {
-            // the requested range crosses the end of the buffer: copy it in two pieces
-            const size_t n0 = m_audio.size() - s0;
-
-            memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
-            memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
-        } else {
-            memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
-        }
-    }
-}
-
-///////////////////////////
-
-// Return s without leading and trailing whitespace.
-std::string trim(const std::string & s) {
-    return std::regex_replace(s, std::regex("^\\s+|\\s+$"), "");
-}
-
-// Return a copy of s in which every occurrence of from is replaced by to.
-// Text inserted by a replacement is not searched again. An empty from matches
-// nothing, so s is returned unchanged.
-std::string replace(const std::string & s, const std::string & from, const std::string & to) {
-    std::string result = s;
-
-    // find("") succeeds at every position, which would make the loop below run forever
-    if (from.empty()) {
-        return result;
-    }
-
-    size_t pos = 0;
-    while ((pos = result.find(from, pos)) != std::string::npos) {
-        result.replace(pos, from.length(), to);
-        pos += to.length();
-    }
-    return result;
-}
-
-// Filter data in place with a first-order recurrence derived from cutoff (Hz) and
-// sample_rate (Hz). The first sample is left unchanged; an empty vector is a no-op.
-void high_pass_filter(std::vector<float> & data, float cutoff, float sample_rate) {
-    if (data.empty()) {
-        // data[0] below must not be read from an empty vector
-        return;
-    }
-
-    const float rc = 1.0f / (2.0f * M_PI * cutoff);
-    const float dt = 1.0f / sample_rate;
-    const float alpha = dt / (rc + dt);
-
-    float y = data[0];
-
-    // NOTE(review): data[i - 1] has already been overwritten with the filtered value of the
-    // previous step - confirm that this is intended before touching the recurrence
-    for (size_t i = 1; i < data.size(); i++) {
-        y = alpha * (y + data[i] - data[i - 1]);
-        data[i] = y;
-    }
-}
-
-// Simple energy-based check on pcmf32.
-//
-// Compares the mean absolute amplitude of the last last_ms milliseconds with the one of
-// the whole buffer. Returns true when the last part is not louder than vad_thold times
-// the overall level, and false otherwise or when the buffer holds no more than last_ms
-// of audio. When freq_thold > 0 the samples are high-pass filtered first, which
-// modifies pcmf32 in place. verbose prints the computed energies to stderr.
-bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float vad_thold, float freq_thold, bool verbose) {
-    const int n_samples      = pcmf32.size();
-    const int n_samples_last = (sample_rate * last_ms) / 1000;
-
-    if (n_samples_last >= n_samples) {
-        // not enough samples - assume no speech
-        return false;
-    }
-
-    if (freq_thold > 0.0f) {
-        high_pass_filter(pcmf32, freq_thold, sample_rate);
-    }
-
-    float energy_all  = 0.0f;
-    float energy_last = 0.0f;
-
-    for (size_t i = 0; i < n_samples; i++) {
-        energy_all += fabsf(pcmf32[i]);
-
-        if (i >= n_samples - n_samples_last) {
-            energy_last += fabsf(pcmf32[i]);
-        }
-    }
-
-    // turn the sums into mean absolute amplitudes
-    energy_all  /= n_samples;
-    energy_last /= n_samples_last;
-
-    if (verbose) {
-        fprintf(stderr, "%s: energy_all: %f, energy_last: %f, vad_thold: %f, freq_thold: %f\n", __func__, energy_all, energy_last, vad_thold, freq_thold);
-    }
-
-    // the end of the buffer is still loud relative to the whole buffer
-    if (energy_last > vad_thold*energy_all) {
-        return false;
-    }
-
-    return true;
-}
-
-// Run whisper on pcmf32 and return the concatenated text of all segments.
-//
-// prob - receives the mean probability of all returned tokens (0 when there are none)
-// t_ms - receives the processing time in milliseconds
-//
-// Returns "" when whisper_full() fails; prob and t_ms are 0 in that case.
-std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
-    const auto t_start = std::chrono::high_resolution_clock::now();
-
-    prob = 0.0f;
-    t_ms = 0;
-
-    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-    wparams.print_progress   = false;
-    wparams.print_special    = params.print_special;
-    wparams.print_realtime   = false;
-    wparams.print_timestamps = !params.no_timestamps;
-    wparams.translate        = params.translate;
-    wparams.no_context       = true;
-    wparams.single_segment   = true;
-    wparams.max_tokens       = params.max_tokens;
-    wparams.language         = params.language.c_str();
-    wparams.n_threads        = params.n_threads;
-
-    wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
-
-    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
-        return "";
-    }
-
-    int prob_n = 0;
-    std::string result;
-
-    // collect the text and sum up the token probabilities
-    const int n_segments = whisper_full_n_segments(ctx);
-    for (int i = 0; i < n_segments; ++i) {
-        const char * text = whisper_full_get_segment_text(ctx, i);
-
-        result += text;
-
-        const int n_tokens = whisper_full_n_tokens(ctx, i);
-        for (int j = 0; j < n_tokens; ++j) {
-            const auto token = whisper_full_get_token_data(ctx, i, j);
-
-            prob += token.p;
-            ++prob_n;
-        }
-    }
-
-    if (prob_n > 0) {
-        prob /= prob_n;
-    }
-
-    const auto t_end = std::chrono::high_resolution_clock::now();
-    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
-
-    return result;
-}
-
-// compute similarity between two strings using Levenshtein distance
-//
-// Returns 1 - distance/max(len(s0), len(s1)): 1.0 for identical strings (including two
-// empty strings), 0.0 when every character has to be edited.
-float similarity(const std::string & s0, const std::string & s1) {
-    const size_t len0 = s0.size() + 1;
-    const size_t len1 = s1.size() + 1;
-
-    // two empty strings are identical; the division at the end would be 0/0
-    if (s0.empty() && s1.empty()) {
-        return 1.0f;
-    }
-
-    std::vector<int> col(len1, 0);
-    std::vector<int> prevCol(len1, 0);
-
-    // distance from the empty prefix of s0 to every prefix of s1
-    for (size_t i = 0; i < len1; i++) {
-        prevCol[i] = i;
-    }
-
-    for (size_t i = 0; i < len0; i++) {
-        col[0] = i;
-        for (size_t j = 1; j < len1; j++) {
-            // i == 0 is the empty prefix of s0: there is no character s0[i - 1] to compare
-            const bool same = i > 0 && s0[i - 1] == s1[j - 1];
-            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (same ? 0 : 1));
-        }
-        col.swap(prevCol);
-    }
-
-    const float dist = prevCol[len1 - 1];
-
-    return 1.0f - (dist / std::max(s0.size(), s1.size()));
-}
-
-// generated with ChatGPT
-//
-// Initial conversation for each supported person (the value of --person). main() looks the
-// person up here and installs the text as the GPT-2 base prompt.
-std::map<std::string, std::string> k_prompts = {
-    { "Santa",
-R"(Kid: Hi Santa! Are you real?
-Santa: Of course I am, my dear! Ho ho ho!
-Kid: Can you please bring me a new toy for Christmas?
-Santa: I'll see what I can do, but you have to make sure to be a good boy or girl and listen to your parents.
-Kid: I will, Santa! Thank you!
-Santa: You're welcome, little one. Merry Christmas! Ho ho ho!
-Kid: Can you tell me how you deliver all the presents to all the kids in the world in one night?
-Santa: It's a secret, but I have a lot of help from my elves and my magical sleigh. And I have a special route that I follow to make sure I visit every child.
-Kid: Wow, that's amazing! Can I please have a ride in your sleigh sometime?
-Santa: I'm sorry, but only good boys and girls get to ride in my sleigh.
-)" },
-    { "Kid",
-R"(Kid: Hi Santa! Are you real?
-Santa: Of course I am, my dear! Ho ho ho!
-Kid: Can you please bring me a new toy for Christmas?
-Santa: I'll see what I can do, but you have to make sure to be a good boy or girl and listen to your parents.
-Kid: I will, Santa! Thank you!
-Kid: Can you tell me how you deliver all the presents to all the kids in the world in one night?
-Santa: It's a secret, but I have a lot of help from my elves and my magical sleigh. And I have a special route that I follow to make sure I visit every child.
-Kid: Wow, that's amazing! Can I please have a ride in your sleigh sometime?
-)" },
-};
-
-// Voice chat loop: wait for speech on the capture device, transcribe it with whisper,
-// let GPT-2 continue the conversation and hand the reply to the TTS command.
-//
-// Returns 1 when the arguments are invalid or a model / the audio device cannot be
-// initialized, 0 after a regular shutdown.
-int main(int argc, char ** argv) {
-    whisper_params params;
-
-    if (whisper_params_parse(argc, argv, params) == false) {
-        return 1;
-    }
-
-    if (whisper_lang_id(params.language.c_str()) == -1) {
-        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
-        whisper_print_usage(argc, argv, params);
-        exit(0);
-    }
-
-    // whisper init
-
-    struct whisper_context * ctx_wsp = whisper_init(params.model_wsp.c_str());
-    if (ctx_wsp == nullptr) {
-        // everything below dereferences the context
-        fprintf(stderr, "%s: failed to initialize whisper from '%s'\n", __func__, params.model_wsp.c_str());
-        return 1;
-    }
-
-    // gpt init
-
-    struct gpt2_context * ctx_gpt = gpt2_init(params.model_gpt.c_str());
-    if (ctx_gpt == nullptr) {
-        fprintf(stderr, "%s: failed to initialize gpt-2 from '%s'\n", __func__, params.model_gpt.c_str());
-        whisper_free(ctx_wsp);
-        return 1;
-    }
-
-    // print some info about the processing
-    {
-        fprintf(stderr, "\n");
-        if (!whisper_is_multilingual(ctx_wsp)) {
-            if (params.language != "en" || params.translate) {
-                params.language = "en";
-                params.translate = false;
-                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
-            }
-        }
-        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
-                __func__,
-                params.n_threads,
-                params.language.c_str(),
-                params.translate ? "translate" : "transcribe",
-                params.no_timestamps ? 0 : 1);
-
-        fprintf(stderr, "\n");
-    }
-
-
-    // init audio
-
-    audio_async audio(30*1000);
-    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
-        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
-        gpt2_free(ctx_gpt);
-        whisper_free(ctx_wsp);
-        return 1;
-    }
-
-    audio.resume();
-
-    int n_iter = 0;
-
-    bool is_running  = true;
-    // the "Kid" speaks first, without waiting for any audio input
-    bool force_speak = params.person == "Kid";
-
-    float prob0 = 0.0f;
-    float prob  = 0.0f;
-
-    std::vector<float> pcmf32_cur;
-    std::vector<float> pcmf32_prompt;
-
-    if (k_prompts.find(params.person) == k_prompts.end()) {
-        fprintf(stderr, "%s: unknown person '%s'\n", __func__, params.person.c_str());
-        gpt2_free(ctx_gpt);
-        whisper_free(ctx_wsp);
-        return 1;
-    }
-
-    gpt2_set_prompt(ctx_gpt, k_prompts.at(params.person).c_str());
-
-    const std::string person_other = params.person == "Santa" ? "Kid" : "Santa";
-    const int voice_id = params.person == "Santa" ? 5 : 2;
-
-    fprintf(stderr, "gpt-2: prompt_base:\n");
-    fprintf(stderr, "========================\n\n");
-    fprintf(stderr, "%s\n", gpt2_get_prompt(ctx_gpt));
-    fprintf(stderr, "========================\n\n");
-
-    // main loop
-    while (is_running) {
-        // handle Ctrl + C
-        {
-            SDL_Event event;
-            while (SDL_PollEvent(&event)) {
-                switch (event.type) {
-                    case SDL_QUIT:
-                        {
-                            is_running = false;
-                        } break;
-                    default:
-                        break;
-                }
-            }
-
-            if (!is_running) {
-                break;
-            }
-        }
-
-        // delay
-        std::this_thread::sleep_for(std::chrono::milliseconds(100));
-
-        int64_t t_ms = 0;
-
-        {
-            audio.get(2000, pcmf32_cur);
-
-            if (vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
-                fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
-
-                audio.get(params.voice_ms, pcmf32_cur);
-
-                std::string text_heard = "Hey little one, what do you want for Christmas?";
-                if (!force_speak) {
-                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
-                }
-
-                force_speak = false;
-
-                // remove text between square brackets using regex
-                {
-                    std::regex re("\\[.*?\\]");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove text between parentheses using regex
-                {
-                    std::regex re("\\(.*?\\)");
-                    text_heard = std::regex_replace(text_heard, re, "");
-                }
-
-                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
-                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-
-                // take first line
-                text_heard = text_heard.substr(0, text_heard.find_first_of("\n"));
-
-                // remove leading and trailing whitespace
-                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
-                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
-
-                const std::vector<gpt_vocab::id> tokens = gpt2_tokenize(ctx_gpt, text_heard.c_str());
-
-                if (text_heard.empty() || tokens.empty()) {
-                    fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
-                    audio.clear();
-
-                    continue;
-                }
-
-                fprintf(stdout, "%s: Heard '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", text_heard.c_str(), "\033[0m", (int) t_ms);
-
-                std::string prompt_base = gpt2_get_prompt(ctx_gpt);
-
-                std::string text_to_speak;
-
-                {
-                    text_heard = person_other + ": " + text_heard;
-
-                    text_to_speak = gpt2_gen_text(ctx_gpt, (prompt_base + text_heard + "\n").c_str(), params.max_tokens);
-                    // this filter also removes '"', '$', '`' and '\\', which keeps the text
-                    // safe inside the double-quoted shell argument built below
-                    text_to_speak = std::regex_replace(text_to_speak, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
-                    text_to_speak = text_to_speak.substr(0, text_to_speak.find_first_of("\n"));
-
-                    // remove first 2 lines of base prompt
-                    if (n_iter > 4) {
-                        {
-                            const size_t pos = prompt_base.find_first_of("\n");
-                            if (pos != std::string::npos) {
-                                prompt_base = prompt_base.substr(pos + 1);
-                            }
-                        }
-                        {
-                            const size_t pos = prompt_base.find_first_of("\n");
-                            if (pos != std::string::npos) {
-                                prompt_base = prompt_base.substr(pos + 1);
-                            }
-                        }
-                    }
-
-                    prompt_base += text_heard + "\n" + text_to_speak + "\n";
-                }
-
-                printf("%s\n", text_to_speak.c_str());
-
-                //printf("========================\n");
-                //printf("gpt-2: prompt_base:\n'%s'\n", prompt_base.c_str());
-                //printf("========================\n");
-
-                gpt2_set_prompt(ctx_gpt, prompt_base.c_str());
-
-                text_to_speak = ::replace(text_to_speak, params.person + ": ", "");
-                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
-
-                // drop the audio captured while processing and speaking
-                audio.clear();
-
-                ++n_iter;
-            }
-        }
-    }
-
-    audio.pause();
-
-    whisper_print_timings(ctx_wsp);
-    whisper_free(ctx_wsp);
-
-    gpt2_free(ctx_gpt);
-
-    return 0;
-}
--- a/examples/twitch.sh
+++ b/examples/twitch.sh
@ -1,109 +0,0 @@
-#!/bin/bash
-#
-# Transcribe twitch.tv livestream by feeding audio input to whisper.cpp at regular intervals
-# Thanks to @keyehzy
-# ref: https://github.com/ggerganov/whisper.cpp/issues/209
-#
-# The script currently depends on the third-party tool "streamlink"
-# On Mac OS, you can install it via "brew install streamlink"
-#
-
-set -eo pipefail
-
-step=10
-model=base.en
-threads=4
-
-help()
-{
-    echo "Example program for captioning a livestream from twitch.tv."
-    echo
-    echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
-    echo "options:"
-    echo "-s       Step in seconds (default is $step)."
-    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
-    echo "-t       Number of threads to use."
-    echo "-h       Print this help page."
-    echo
-}
-
-check_requirements()
-{
-    if ! command -v ./main &>/dev/null; then
-        echo "whisper.cpp main executable is required (make)"
-        exit 1
-    fi
-
-    if ! command -v streamlink &>/dev/null; then
-        echo "streamlink is required (https://streamlink.github.io)"
-        exit 1
-    fi
-
-    if ! command -v ffmpeg &>/dev/null; then
-        echo "ffmpeg is required (https://ffmpeg.org)"
-        exit 1
-    fi
-}
-
-check_requirements
-
-while getopts ":s:m:t:h" option; do
-    case $option in
-	s)
-            step=$OPTARG;;
-	m)
-            model=$OPTARG;;
-	t)
-	    threads=$OPTARG;;
-	h)
-            help
-            exit;;
-	\?)
-	    help
-	    exit;;
-    esac
-done
-
-url=${@:$OPTIND:1}
-
-if [ -z $url ]; then
-    help
-    exit
-fi
-
-echo "Piping from streamlink url=$url model=$model step=$step threads=$threads"
-streamlink $url best -O 2>/dev/null | ffmpeg -loglevel quiet -i - -y -probesize 32 -y -ar 16000 -ac 1 -acodec pcm_s16le /tmp/whisper-live0.wav &
-
-if [ $? -ne 0 ]; then
-    printf "error: ffmpeg failed\n"
-    exit 1
-fi
-
-echo "Buffering stream... (this should take $step seconds)"
-sleep $(($step))
-
-set +e
-
-echo "Starting..."
-
-i=0
-SECONDS=0
-while true
-do
-    err=1
-    while [ $err -ne 0 ]; do
-        if [ $i -gt 0 ]; then
-            ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.wav -y -ss $(($i*$step-1)).5 -t $step -c copy /tmp/whisper-live.wav 2> /tmp/whisper-live.err
-        else
-            ffmpeg -loglevel quiet -v error -noaccurate_seek -i /tmp/whisper-live0.wav -y -ss $(($i*$step)) -t $step -c copy /tmp/whisper-live.wav 2> /tmp/whisper-live.err
-        fi
-        err=$(cat /tmp/whisper-live.err | wc -l)
-    done
-
-    ./main -t $threads -m ./models/ggml-$model.bin -f /tmp/whisper-live.wav --no-timestamps -otxt 2> /tmp/whispererr | tail -n 1
-
-    while [ $SECONDS -lt $((($i+1)*$step)) ]; do
-        sleep 1
-    done
-    ((i=i+1))
-done
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -1,47 +1,5 @@
-#
-# libmain
-#
-
-set(TARGET libmain)
-
-add_executable(${TARGET}
-    emscripten.cpp
-    )
-
-target_link_libraries(${TARGET} PRIVATE
-    whisper
-    )
-
-unset(EXTRA_FLAGS)
-
-if (WHISPER_WASM_SINGLE_FILE)
-    set(EXTRA_FLAGS "-s SINGLE_FILE=1")
-    message(STATUS "Embedding WASM inside main.js")
-
-    add_custom_command(
-        TARGET ${TARGET} POST_BUILD
-        COMMAND ${CMAKE_COMMAND} -E copy
-        ${CMAKE_BINARY_DIR}/bin/libmain.js
-        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/whisper.wasm/main.js
-        )
-endif()
-
-set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
-    --bind \
-    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1024MB \
-    -s TOTAL_MEMORY=1024MB \
-    -s FORCE_FILESYSTEM=1 \
-    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
-    ${EXTRA_FLAGS} \
-    ")
-
-#
-# whisper.wasm
-#
-
 set(TARGET whisper.wasm)

-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html  ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html        ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/../helpers.js          ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/helpers.js @ONLY)
+configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/whisper.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/whisper.js  COPYONLY)
--- a/examples/whisper.wasm/emscripten.cpp
+++ b/examples/whisper.wasm/emscripten.cpp
@ -1,108 +0,0 @@
-#include "whisper.h"
-
-#include <emscripten.h>
-#include <emscripten/bind.h>
-
-#include <vector>
-#include <thread>
-
-std::thread g_worker;
-
-std::vector<struct whisper_context *> g_contexts(4, nullptr);
-
-EMSCRIPTEN_BINDINGS(whisper) {
-    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
-        if (g_worker.joinable()) {
-            g_worker.join();
-        }
-
-        for (size_t i = 0; i < g_contexts.size(); ++i) {
-            if (g_contexts[i] == nullptr) {
-                g_contexts[i] = whisper_init(path_model.c_str());
-                if (g_contexts[i] != nullptr) {
-                    return i + 1;
-                } else {
-                    return (size_t) 0;
-                }
-            }
-        }
-
-        return (size_t) 0;
-    }));
-
-    emscripten::function("free", emscripten::optional_override([](size_t index) {
-        if (g_worker.joinable()) {
-            g_worker.join();
-        }
-
-        --index;
-
-        if (index < g_contexts.size()) {
-            whisper_free(g_contexts[index]);
-            g_contexts[index] = nullptr;
-        }
-    }));
-
-    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
-        if (g_worker.joinable()) {
-            g_worker.join();
-        }
-
-        --index;
-
-        if (index >= g_contexts.size()) {
-            return -1;
-        }
-
-        if (g_contexts[index] == nullptr) {
-            return -2;
-        }
-
-        struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
-
-        params.print_realtime   = true;
-        params.print_progress   = false;
-        params.print_timestamps = true;
-        params.print_special    = false;
-        params.translate        = translate;
-        params.language         = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
-        params.n_threads        = std::min(8, (int) std::thread::hardware_concurrency());
-        params.offset_ms        = 0;
-
-        std::vector<float> pcmf32;
-        const int n = audio["length"].as<int>();
-
-        emscripten::val heap = emscripten::val::module_property("HEAPU8");
-        emscripten::val memory = heap["buffer"];
-
-        pcmf32.resize(n);
-
-        emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(pcmf32.data()), n);
-        memoryView.call<void>("set", audio);
-
-        // print system information
-        {
-            printf("system_info: n_threads = %d / %d | %s\n",
-                    params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
-
-            printf("%s: processing %d samples, %.1f sec, %d threads, %d processors, lang = %s, task = %s ...\n",
-                    __func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
-                    params.n_threads, 1,
-                    params.language,
-                    params.translate ? "translate" : "transcribe");
-
-            printf("\n");
-        }
-
-        // run the worker
-        {
-            g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32)]() {
-                whisper_reset_timings(g_contexts[index]);
-                whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
-                whisper_print_timings(g_contexts[index]);
-            });
-        }
-
-        return 0;
-    }));
-}
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@ -550,6 +550,6 @@
                }
            }
        </script>
-        <script type="text/javascript" src="main.js"></script>
+        <script type="text/javascript" src="whisper.js"></script>
    </body>
 </html>
--- a/examples/yt-wsp.sh
+++ b/examples/yt-wsp.sh
@ -2,15 +2,7 @@

 # Small shell script to more easily automatically download and transcribe live stream VODs.
 # This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
-# Use `./examples/yt-wsp.sh help` to print help info.
-#
-# Sample usage:
-#
-#   git clone https://github.com/ggerganov/whisper.cpp
-#   cd whisper.cpp
-#   make
-#   ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
-#
+# Use `./transcribe-vod help` to print help info.

 # MIT License

@ -51,7 +43,7 @@ cleanup() {
 }

 print_help() {
-    echo "Usage: ./examples/yt-wsp.sh <video_url>"
+    echo "Usage: ./transcribe-vod <video_url>"
    echo "See configurable env variables in the script"
    echo "This will produce an MP4 muxed file called res.mp4 in the working directory"
    echo "Requirements: ffmpeg yt-dlp whisper"
@ -73,14 +65,7 @@ check_requirements() {
    if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
        WHISPER_EXECUTABLE="./main"
        if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
-            echo "Whisper is required (https://github.com/ggerganov/whisper.cpp):"
-            echo "Sample usage:"
-            echo ""
-            echo "  git clone https://github.com/ggerganov/whisper.cpp"
-            echo "  cd whisper.cpp"
-            echo "  make"
-            echo "  ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890"
-            echo ""
+            echo "Whisper is required (https://github.com/ggerganov/whisper.cpp)."
            exit 1
        fi
    fi
--- a/extra/bench-all.sh
+++ b/extra/bench-all.sh
@ -17,8 +17,8 @@ printf "Running benchmark for all models\n"
 printf "This can take a while!\n"
 printf "\n"

-printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
-printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
+printf "| CPU | OS | Config | Model | Threads | Load [ms] | Encode [ms] | Commit |\n"
+printf "| --- | -- | ------ | ----- | ------- | --------- | ----------- | ------ |\n"

 for model in "${models[@]}"; do
    # run once to heat-up the cache
@ -34,10 +34,6 @@ for model in "${models[@]}"; do
    system_info=$(echo "$output" | grep "system_info")
    n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}')

-    # floor to milliseconds
-    load_time=${load_time%.*}
-    encode_time=${encode_time%.*}
-
    config=""

    if [[ $system_info == *"AVX2 = 1"* ]]; then
--- a/extra/deploy-wasm.sh
+++ b/extra/deploy-wasm.sh
@ -21,11 +21,10 @@ if [ $? -ne 0 ]; then
 fi

 # copy all wasm files to the node
-scp bin/whisper.wasm/* root@linode0:/var/www/html/whisper/         && scp bin/libmain.worker.js    root@linode0:/var/www/html/whisper/
+scp bin/whisper.wasm/* root@linode0:/var/www/html/whisper/         && scp bin/libwhisper.worker.js root@linode0:/var/www/html/whisper/
 scp bin/stream.wasm/*  root@linode0:/var/www/html/whisper/stream/  && scp bin/libstream.worker.js  root@linode0:/var/www/html/whisper/stream/
 scp bin/command.wasm/* root@linode0:/var/www/html/whisper/command/ && scp bin/libcommand.worker.js root@linode0:/var/www/html/whisper/command/
 scp bin/talk.wasm/*    root@linode0:/var/www/html/whisper/talk/    && scp bin/libtalk.worker.js    root@linode0:/var/www/html/whisper/talk/
-scp bin/bench.wasm/*   root@linode0:/var/www/html/whisper/bench/   && scp bin/libbench.worker.js   root@linode0:/var/www/html/whisper/bench/

 echo "Done"
 exit
--- a/ggml.c
+++ b/ggml.c
@ -69,10 +69,6 @@ static int sched_yield (void) {
 typedef void* thread_ret_t;
 #endif

-#ifdef __HAIKU__
-#define static_assert(cond, msg) _Static_assert(cond, msg)
-#endif
-
 #define GGML_DEBUG 0
 #define GGML_GELU_FP16

@ -100,6 +96,8 @@ typedef void* thread_ret_t;
 #include <Accelerate/Accelerate.h>
 #elif GGML_USE_OPENBLAS
 #include <cblas.h>
+// sgemm
+extern void sgemm_(char* transa, char* transb, int* m, int* n, int* k, float* alpha, float* a, int* lda, float* b, int* ldb, float* beta, float* c, int* ldc);
 #endif

 // floating point type used to accumulate sums
@ -155,8 +153,7 @@ static inline float fp32_from_bits(uint32_t w) {
    union {
        uint32_t as_bits;
        float as_value;
-    } fp32;
-    fp32.as_bits = w;
+    } fp32 = { w };
    return fp32.as_value;
 }

@ -164,8 +161,7 @@ static inline uint32_t fp32_to_bits(float f) {
 	union {
 		float as_value;
 		uint32_t as_bits;
-	} fp32;
-	fp32.as_value = f;
+	} fp32 = { f };
 	return fp32.as_bits;
 }

@ -4223,7 +4219,7 @@ bool ggml_compute_forward_mul_mat_use_blas(
    const int ne1 = dst->ne[1];

    // TODO: find the optimal values for these
-    if (ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
+    if (ggml_is_contiguous(src1) && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) {
        //printf("BLAS: %d %d %d\n", ne0, ne1, ne10);
        return true;
    }
@ -4300,6 +4296,7 @@ void ggml_compute_forward_mul_mat_f32(

 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
    if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
+        GGML_ASSERT(ggml_is_contiguous(src0));
        GGML_ASSERT(nb10 == sizeof(float));

        if (params->ith != 0) return;
@ -4591,22 +4588,25 @@ void ggml_compute_forward_mul_mat_f16_f32(
                //    }
                //}

+                // zT = y * xT
                {
-#if 1
-                    // zT = y * xT
-                    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
-                            ne11, ne01, ne10,
-                            1.0f,    y, ne00,
-                                     x, ne00,
-                            0.0f,    d, ne01);
-#else
-                    // zT = (xT * y)T
-                    cblas_sgemm(CblasColMajor, CblasTrans, CblasNoTrans,
-                            ne01, ne11, ne10,
-                            1.0f,    x, ne00,
-                                     y, ne00,
-                            0.0f,    d, ne01);
-#endif
+                    //cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+                    //        ne11, ne01, ne10,
+                    //        1.0f,    y, ne10,
+                    //                 x, ne10,
+                    //        0.0f,    d, ne01);
+
+                    // this is compatible with nvblas
+                    float one = 1.0f;
+                    float zero = 0.0f;
+                    sgemm_(
+                            "T", "N",
+                            &ne0, &ne1, &ne10,
+                            &one,
+                            x, &ne10,
+                            y, &ne10,
+                            &zero,
+                            d, &ne0);
                }
            }
        }
--- a/models/download-ggml-model.cmd
+++ b/models/download-ggml-model.cmd
@ -2,7 +2,6 @@

 pushd %~dp0
 set models_path=%CD%
-for %%d in (%~dp0..) do set root_path=%%~fd
 popd

 set argc=0
@ -48,9 +47,9 @@ if %ERRORLEVEL% neq 0 (
  goto :eof
 )

-echo Done! Model %model% saved in %root_path%\models\ggml-%model%.bin
+echo Done! Model %model% saved in %models_path%\ggml-%model%.bin
 echo You can now use it like this:
-echo main.exe -m %root_path%\models\ggml-%model%.bin -f %root_path%\samples\jfk.wav
+echo main.exe -m %models_path%\ggml-%model%.bin -f %models_path%\..\samples\jfk.wav

 goto :eof

--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -1,14 +1,4 @@
 if (EMSCRIPTEN)
-    #
-    # test-whisper-js
-
-    set(TEST_TARGET test-whisper-js)
-
-    add_test(NAME ${TEST_TARGET}
-        COMMAND node test-whisper.js --experimental-wasm-threads
-        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
-        )
-
    return()
 endif()

--- a/tests/test-whisper.js
+++ b/tests/test-whisper.js
@ -1,58 +0,0 @@
-var factory = require('../bindings/javascript/whisper.js')
-
-factory().then(function(whisper) {
-    var fs = require('fs');
-
-    // to avoid reading WAV files and depending on some 3rd-party package, we read
-    // 32-bit float PCM directly. to genereate it:
-    //
-    //   $ ffmpeg -i samples/jfk.wav -f f32le -acodec pcm_f32le samples/jfk.pcmf32
-    //
-    let fname_wav   = "../samples/jfk.pcmf32";
-    let fname_model = "../models/ggml-base.en.bin";
-
-    // init whisper
-    {
-        // read binary data from file
-        var model_data = fs.readFileSync(fname_model);
-        if (model_data == null) {
-            console.log("whisper: failed to read model file");
-            process.exit(1);
-        }
-
-        // write binary data to WASM memory
-        whisper.FS_createDataFile("/", "whisper.bin", model_data, true, true);
-
-        // init the model
-        var ret = whisper.init("whisper.bin");
-        if (ret == false) {
-            console.log('whisper: failed to init');
-            process.exit(1);
-        }
-    }
-
-    // transcribe wav file
-    {
-        // read raw binary data
-        var pcm_data = fs.readFileSync(fname_wav);
-        if (pcm_data == null) {
-            console.log("whisper: failed to read wav file");
-            process.exit(1);
-        }
-
-        // convert to 32-bit float array
-        var pcm = new Float32Array(pcm_data.buffer);
-
-        // transcribe
-        var ret = whisper.full_default(pcm, "en", false);
-        if (ret != 0) {
-            console.log("whisper: failed to transcribe");
-            process.exit(1);
-        }
-    }
-
-    // free memory
-    {
-        whisper.free();
-    }
-});
--- a/whisper.cpp
+++ b/whisper.cpp
@ -429,12 +429,6 @@ struct whisper_context {
    int32_t exp_n_audio_ctx; // 0 - use default
 };

-template<typename T>
-static void read_safe(std::ifstream& fin, T& dest)
-{
-  fin.read((char*)& dest, sizeof(T));
-}
-
 // load the model from a ggml file
 //
 // file format:
@ -461,7 +455,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
    // verify magic
    {
        uint32_t magic;
-        read_safe(fin, magic);
+        fin.read((char *) &magic, sizeof(magic));
        if (magic != 0x67676d6c) {
            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
            return false;
@ -472,17 +466,17 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
    {
        auto & hparams = model.hparams;

-        read_safe(fin, hparams.n_vocab);
-        read_safe(fin, hparams.n_audio_ctx);
-        read_safe(fin, hparams.n_audio_state);
-        read_safe(fin, hparams.n_audio_head);
-        read_safe(fin, hparams.n_audio_layer);
-        read_safe(fin, hparams.n_text_ctx);
-        read_safe(fin, hparams.n_text_state);
-        read_safe(fin, hparams.n_text_head);
-        read_safe(fin, hparams.n_text_layer);
-        read_safe(fin, hparams.n_mels);
-        read_safe(fin, hparams.f16);
+        fin.read((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
+        fin.read((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
+        fin.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
+        fin.read((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
+        fin.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
+        fin.read((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
+        fin.read((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
+        fin.read((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
+        fin.read((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
+        fin.read((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
+        fin.read((char *) &hparams.f16,           sizeof(hparams.f16));

        assert(hparams.n_text_state == hparams.n_audio_state);

@ -530,8 +524,8 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
    {
        auto & filters = wctx.model.filters;

-        read_safe(fin, filters.n_mel);
-        read_safe(fin, filters.n_fft);
+        fin.read((char *) &filters.n_mel, sizeof(filters.n_mel));
+        fin.read((char *) &filters.n_fft, sizeof(filters.n_fft));

        filters.data.resize(filters.n_mel * filters.n_fft);
        fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
@ -540,7 +534,7 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
    // load vocab
    {
        int32_t n_vocab = 0;
-        read_safe(fin, n_vocab);
+        fin.read((char *) &n_vocab, sizeof(n_vocab));

        //if (n_vocab != model.hparams.n_vocab) {
        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
@ -551,11 +545,10 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
        std::string word;
        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
-            read_safe(fin, len);
+            fin.read((char *) &len, sizeof(len));

-            std::vector<char> tmp(len); // create a buffer
-            fin.read( &tmp[0], tmp.size() ); // read to buffer
-            word.assign(&tmp[0], tmp.size());
+            word.resize(len);
+            fin.read((char *) word.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
@ -735,9 +728,10 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = wctx.buf_model->size();
-        params.mem_buffer = wctx.buf_model->data();
+        struct ggml_init_params params = {
+            .mem_size   = wctx.buf_model->size(),
+            .mem_buffer = wctx.buf_model->data(),
+        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -944,9 +938,10 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx

    // create the ggml memory context
    {
-        struct ggml_init_params params;
-        params.mem_size   = wctx.buf_memory.size();
-        params.mem_buffer = wctx.buf_memory.data();
+        struct ggml_init_params params = {
+            .mem_size   = wctx.buf_memory.size(),
+            .mem_buffer = wctx.buf_memory.data(),
+        };

        model.ctx_mem = ggml_init(params);
        if (!model.ctx_mem) {
@ -1003,9 +998,9 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
            int32_t length;
            int32_t ftype;

-            read_safe(fin, n_dims);
-            read_safe(fin, length);
-            read_safe(fin, ftype);
+            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));

            if (fin.eof()) {
                break;
@ -1014,14 +1009,12 @@ static bool whisper_model_load(const std::string & fname, whisper_context & wctx
            int32_t nelements = 1;
            int32_t ne[3] = { 1, 1, 1 };
            for (int i = 0; i < n_dims; ++i) {
-                read_safe(fin, ne[i]);
+                fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
                nelements *= ne[i];
            }

-            std::string name;
-            std::vector<char> tmp(length); // create a buffer
-            fin.read( &tmp[0], tmp.size() ); // read to buffer
-            name.assign(&tmp[0], tmp.size());
+            std::string name(length, 0);
+            fin.read(&name[0], length);

            if (model.tensors.find(name.data()) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
@ -1095,9 +1088,10 @@ static bool whisper_encode(
    const int n_mels = hparams.n_mels;
    assert(mel_inp.n_mel == n_mels);

-    struct ggml_init_params params;
-    params.mem_size   = wctx.buf_compute.size();
-    params.mem_buffer = wctx.buf_compute.data();   
+    struct ggml_init_params params = {
+        .mem_size   = wctx.buf_compute.size(),
+        .mem_buffer = wctx.buf_compute.data(),
+    };

    struct ggml_context * ctx0 = ggml_init(params);

@ -1172,9 +1166,10 @@ static bool whisper_encode(

        // create separate context for each layer to reduce memory usage

-        struct ggml_init_params paramsL;
-        paramsL.mem_size   = wctx.buf_compute_layer.size();
-        paramsL.mem_buffer = wctx.buf_compute_layer.data();
+        struct ggml_init_params paramsL = {
+            .mem_size   = wctx.buf_compute_layer.size(),
+            .mem_buffer = wctx.buf_compute_layer.data(),
+        };

        struct ggml_context * ctxL = ggml_init(paramsL);

@ -1508,9 +1503,10 @@ static bool whisper_decode(
    const int N = n_tokens;
    const int M = wctx.exp_n_audio_ctx > 0 ? wctx.exp_n_audio_ctx : hparams.n_audio_ctx;

-    struct ggml_init_params params;
-    params.mem_size   = wctx.buf_compute.size();
-    params.mem_buffer = wctx.buf_compute.data();
+    struct ggml_init_params params = {
+            .mem_size   = wctx.buf_compute.size(),
+            .mem_buffer = wctx.buf_compute.data(),
+        };

    struct ggml_context * ctx0 = ggml_init(params);

@ -1533,9 +1529,10 @@ static bool whisper_decode(
    for (int il = 0; il < n_layer; ++il) {
        const auto & layer = model.layers_decoder[il];

-        struct ggml_init_params paramsL;
-        paramsL.mem_size   = wctx.buf_compute_layer.size();
-        paramsL.mem_buffer = wctx.buf_compute_layer.data();
+        struct ggml_init_params paramsL = {
+            .mem_size   = wctx.buf_compute_layer.size(),
+            .mem_buffer = wctx.buf_compute_layer.data(),
+        };

        struct ggml_context * ctxL = ggml_init(paramsL);
        struct ggml_cgraph gf = {};
@ -2690,7 +2687,6 @@ int whisper_full(
        tokens_cur.clear();

        bool failed = false;
-        bool has_ts = false; // have we already sampled a non-beg timestamp token for the current segment?

        for (int i = 0, n_max = whisper_n_text_ctx(ctx)/2 - 4; i < n_max; ++i) {
            if (whisper_decode(ctx, prompt.data(), prompt.size(), n_past, params.n_threads) != 0) {
@ -2716,13 +2712,13 @@ int whisper_full(
                    const int seek_delta_new = 2*(token.id - whisper_token_beg(ctx));

                    // do not allow to go back in time
-                    if (has_ts && seek_delta > seek_delta_new && result_len < i) {
+                    if (seek_delta != 100*WHISPER_CHUNK_SIZE &&
+                        seek_delta > seek_delta_new && result_len < i) {
                        break;
                    }

                    seek_delta = seek_delta_new;
                    result_len = i + 1;
-                    has_ts = true;
                }

                // add it to the context
@ -2734,11 +2730,8 @@ int whisper_full(
                //    printf("%s: %10s %6d %6.3f '%s'\n", __func__, tt.c_str(), token.id, token.pt, ctx->vocab.id_to_token[token.id].c_str());
                //}

-                // end of segment
-                if (token.id == whisper_token_eot(ctx) ||               // end of text token
-                    (params.max_tokens > 0 && i > params.max_tokens) || // max tokens per segment reached
-                    (has_ts && seek + seek_delta + 100 >= seek_end)     // end of audio reached
-                    ) {
+                // end of text token, or max tokens per segment reached
+                if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
                    if (result_len == 0) {
                        if (seek + seek_delta + 100 >= seek_end) {
                            result_len = i + 1;
@ -2909,9 +2902,10 @@ int whisper_full_parallel(

        // create the ggml memory context
        {
-            struct ggml_init_params params;
-            params.mem_size   = ctxs[i].buf_memory.size();
-            params.mem_buffer = ctxs[i].buf_memory.data();
+            struct ggml_init_params params = {
+                .mem_size   = ctxs[i].buf_memory.size(),
+                .mem_buffer = ctxs[i].buf_memory.data(),
+            };

            model.ctx_mem = ggml_init(params);
            if (!model.ctx_mem) {
				`@ -1 +0,0 @@`
				"use strict";var Module={};var ENVIRONMENT_IS_NODE=typeof process=="object"&&typeof process.versions=="object"&&typeof process.versions.node=="string";if(ENVIRONMENT_IS_NODE){var nodeWorkerThreads=require("worker_threads");var parentPort=nodeWorkerThreads.parentPort;parentPort.on("message",data=>onmessage({data:data}));var fs=require("fs");Object.assign(global,{self:global,require:require,Module:Module,location:{href:__filename},Worker:nodeWorkerThreads.Worker,importScripts:function(f){(0,eval)(fs.readFileSync(f,"utf8")+"//# sourceURL="+f)},postMessage:function(msg){parentPort.postMessage(msg)},performance:global.performance\|\|{now:function(){return Date.now()}}})}var initializedJS=false;var pendingNotifiedProxyingQueues=[];function threadPrintErr(){var text=Array.prototype.slice.call(arguments).join(" ");if(ENVIRONMENT_IS_NODE){fs.writeSync(2,text+"\n");return}console.error(text)}function threadAlert(){var text=Array.prototype.slice.call(arguments).join(" ");postMessage({cmd:"alert",text:text,threadId:Module["_pthread_self"]()})}var err=threadPrintErr;self.alert=threadAlert;Module["instantiateWasm"]=(info,receiveInstance)=>{var instance=new WebAssembly.Instance(Module["wasmModule"],info);receiveInstance(instance);Module["wasmModule"]=null;return instance.exports};self.onunhandledrejection=e=>{throw e.reason??e};self.onmessage=e=>{try{if(e.data.cmd==="load"){Module["wasmModule"]=e.data.wasmModule;for(const handler of e.data.handlers){Module[handler]=function(){postMessage({cmd:"callHandler",handler:handler,args:[...arguments]})}}Module["wasmMemory"]=e.data.wasmMemory;Module["buffer"]=Module["wasmMemory"].buffer;Module["ENVIRONMENT_IS_PTHREAD"]=true;if(typeof e.data.urlOrBlob=="string"){importScripts(e.data.urlOrBlob)}else{var objectUrl=URL.createObjectURL(e.data.urlOrBlob);importScripts(objectUrl);URL.revokeObjectURL(objectUrl)}whisper_factory(Module).then(function(instance){Module=instance})}else 
if(e.data.cmd==="run"){Module["__performance_now_clock_drift"]=performance.now()-e.data.time;Module["__emscripten_thread_init"](e.data.pthread_ptr,0,0,1);Module["establishStackSpace"]();Module["PThread"].receiveObjectTransfer(e.data);Module["PThread"].threadInitTLS();if(!initializedJS){Module["__embind_initialize_bindings"]();pendingNotifiedProxyingQueues.forEach(queue=>{Module["executeNotifiedProxyingQueue"](queue)});pendingNotifiedProxyingQueues=[];initializedJS=true}try{Module["invokeEntryPoint"](e.data.start_routine,e.data.arg)}catch(ex){if(ex!="unwind"){if(ex instanceof Module["ExitStatus"]){if(Module["keepRuntimeAlive"]()){}else{Module["__emscripten_thread_exit"](ex.status)}}else{throw ex}}}}else if(e.data.cmd==="cancel"){if(Module["_pthread_self"]()){Module["__emscripten_thread_exit"](-1)}}else if(e.data.target==="setimmediate"){}else if(e.data.cmd==="processProxyingQueue"){if(initializedJS){Module["executeNotifiedProxyingQueue"](e.data.queue)}else{pendingNotifiedProxyingQueues.push(e.data.queue)}}else if(e.data.cmd){err("worker.js received unknown command "+e.data.cmd);err(e.data)}}catch(ex){if(Module["__emscripten_thread_crashed"]){Module["__emscripten_thread_crashed"]()}throw ex}};