Compare commits


2 Commits

SHA1         Message                          Date
4597c9c19b   wip : try to compress just mlp   2022-10-08 15:12:15 +03:00
4a4a754220   wip : experimenting              2022-10-08 14:08:43 +03:00
63 changed files with 1021 additions and 5459 deletions

.gitignore (23 changes)

@@ -1,22 +1,7 @@
-*.o
-.cache/
-.vs/
-.vscode/
-.DS_Store
-build/
-build-em/
-build-debug/
-build-release/
-build-sanitize-addr/
-build-sanitize-thread/
+sync.sh
 main
 stream
-bench
-sync.sh
+*.o
+.cache
+build/
 compile_commands.json
-examples/whisper.objc/whisper.objc.xcodeproj/xcshareddata
-examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
-examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

.gitmodules (3 deletions)

@@ -1,3 +0,0 @@
-[submodule "bindings/ios"]
-    path = bindings/ios
-    url = https://github.com/ggerganov/whisper.spm

CMakeLists.txt

@@ -7,33 +7,12 @@ set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
     set(WHISPER_STANDALONE ON)
-    include(cmake/GitVars.cmake)
-    include(cmake/BuildTypes.cmake)
-
-    # configure project version
-    if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
-        configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
-    endif()
 else()
     set(WHISPER_STANDALONE OFF)
 endif()

-if (EMSCRIPTEN)
-    set(BUILD_SHARED_LIBS_DEFAULT OFF)
-
-    option(WHISPER_WASM_SINGLE_FILE "whisper: embed WASM inside the generated whisper.js" ON)
-else()
-    if (MINGW)
-        set(BUILD_SHARED_LIBS_DEFAULT OFF)
-    else()
-        set(BUILD_SHARED_LIBS_DEFAULT ON)
-    endif()
-endif()

 # options

-option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
-
 option(WHISPER_ALL_WARNINGS           "whisper: enable all compiler warnings" ON)
 option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)

@@ -42,35 +21,24 @@ option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
 option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)

 option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
-option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})

 option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF)

-if (APPLE)
-    option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
-else()
-    option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
-endif()
-
-option(WHISPER_PERF "whisper: enable perf timings" OFF)
-
 # sanitizers

-if (NOT MSVC)
 if (WHISPER_SANITIZE_THREAD)
     set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=thread")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
 endif()

 if (WHISPER_SANITIZE_ADDRESS)
     set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=address -fno-omit-frame-pointer")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
 endif()

 if (WHISPER_SANITIZE_UNDEFINED)
     set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -fsanitize=undefined")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
-endif()
 endif()

 #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")

@@ -79,44 +47,18 @@ endif()
 # dependencies

 set(CMAKE_C_STANDARD   11)
-set(CMAKE_CXX_STANDARD 20)
+set(CMAKE_CXX_STANDARD 11)

 find_package(Threads REQUIRED)

-# on APPLE - include Accelerate framework
-if (APPLE AND NOT WHISPER_NO_ACCELERATE)
-    find_library(ACCELERATE_FRAMEWORK Accelerate)
-    if (ACCELERATE_FRAMEWORK)
-        message(STATUS "Accelerate framework found")
-        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-    else()
-        message(WARNING "Accelerate framework not found")
-    endif()
-
-    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
-    find_library(METAL_FRAMEWORK Metal REQUIRED)
-    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
-    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
-
-    set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS}
-        ${FOUNDATION_LIBRARY}
-        ${METAL_FRAMEWORK}
-        ${METALKIT_FRAMEWORK}
-        ${METALPERFORMANCE_FRAMEWORK})
-endif()
-
-if (WHISPER_SUPPORT_OPENBLAS)
-    find_library(OPENBLAS_LIB openblas)
-    if (OPENBLAS_LIB)
-        message(STATUS "OpenBLAS found")
-        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${OPENBLAS_LIB})
-        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-    else()
-        message(WARNING "OpenBLAS not found")
-    endif()
+if (WHISPER_SUPPORT_SDL2)
+    # SDL2
+    find_package(SDL2 REQUIRED)
+
+    string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES)
+
+    message(STATUS "SDL2_INCLUDE_DIRS = ${SDL2_INCLUDE_DIRS}")
+    message(STATUS "SDL2_LIBRARIES   = ${SDL2_LIBRARIES}")
 endif()

 # compiler flags

@@ -127,7 +69,7 @@ if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
 endif ()

 if (WHISPER_ALL_WARNINGS)
-    if (NOT MSVC)
+    if (CMAKE_COMPILER_IS_GNUCC OR CMAKE_C_COMPILER_ID MATCHES "Clang")
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
             -Wall \
             -Wextra \

@@ -138,14 +80,12 @@ if (WHISPER_ALL_WARNINGS)
             -Wpointer-arith \
         ")
     else()
-        # todo : msvc
+        # todo : windows
     endif()
 endif()

-if (NOT MSVC)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
-    #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
-endif()
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
+#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")

 message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")

@@ -153,33 +93,15 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES ...)
     message(STATUS "ARM detected")
 else()
     message(STATUS "x86 detected")
-    if (MSVC)
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
-        set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /arch:AVX2")
-    else()
-        if (EMSCRIPTEN)
-            # we require support for WASM SIMD 128-bit
-            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -msimd128")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
-        else()
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
-        endif()
-    endif()
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx -mavx2 -mfma -mf16c")
 endif()

-if (WHISPER_PERF)
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
-endif()
-
-#
 # whisper - this is the main library of the project
-#

 set(TARGET whisper)

 add_library(${TARGET}
     ggml.c
-    ggml-mtl.m
     whisper.cpp
     )

@@ -187,13 +109,7 @@ target_include_directories(${TARGET} PUBLIC
     .
     )

-if (MSVC)
-    target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
-
-    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
-else()
-    target_link_libraries(${TARGET} PRIVATE m ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
-endif()
+target_link_libraries(${TARGET} PRIVATE ${CMAKE_THREAD_LIBS_INIT})

 if (BUILD_SHARED_LIBS)
     target_link_libraries(${TARGET} PUBLIC

@@ -214,23 +130,24 @@ install(TARGETS ${TARGET}
     ARCHIVE DESTINATION lib/static
     )

-#
-# bindings
-#
-
-add_subdirectory(bindings)
-
-#
 # programs, examples and tests
-#

 if (WHISPER_STANDALONE)
+    # main
+    set(TARGET main)
+    add_executable(${TARGET} main.cpp)
+    target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})
+
+    if (WHISPER_SUPPORT_SDL2)
+        # stream
+        set(TARGET stream)
+        add_executable(${TARGET} stream.cpp)
+        target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+        target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+    endif ()
+
     if (WHISPER_BUILD_TESTS)
         enable_testing()
         add_subdirectory(tests)
     endif ()
-
-    if (WHISPER_BUILD_EXAMPLES)
-        add_subdirectory(examples)
-    endif()
 endif ()

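The `option(...)` entries in this CMakeLists.txt are ordinary CMake cache variables, so they are toggled at configure time. A minimal configure-and-build sketch, assuming an out-of-source `build/` directory; the `-D` option names are taken from the diff above, everything else is stock CMake usage:

```bash
# configure with SDL2 support (for the stream tool) and the address sanitizer
mkdir -p build && cd build
cmake .. -DWHISPER_SUPPORT_SDL2=ON -DWHISPER_SANITIZE_ADDRESS=ON

# build the whisper library plus the main (and, with SDL2, stream) targets
make
```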
Makefile

@@ -1,35 +1,16 @@
-ifndef UNAME_S
 UNAME_S := $(shell uname -s)
-endif
-
-ifndef UNAME_P
 UNAME_P := $(shell uname -p)
-endif
-
-ifndef UNAME_M
 UNAME_M := $(shell uname -m)
-endif
-
-# Mac OS + Arm can report x86_64
-# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
-ifeq ($(UNAME_S),Darwin)
-    ifneq ($(UNAME_P),arm)
-        SYSCTL_M := $(shell sysctl -n hw.optional.arm64)
-        ifeq ($(SYSCTL_M),1)
-            # UNAME_P := arm
-            # UNAME_M := arm64
-            warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
-        endif
-    endif
-endif

 #
 # Compile flags
 #

-CFLAGS   = -I.              -O3 -std=c11
-CXXFLAGS = -I. -I./examples -O3 -std=c++11
-LDFLAGS  =
+CFLAGS   = -O3 -std=c11
+CXXFLAGS = -O3 -std=c++11
+
+CFLAGS   += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wno-unused-parameter -Wno-unused-function

 # OS specific
 # TODO: support Windows

@@ -41,30 +22,17 @@ ifeq ($(UNAME_S),Darwin)
     CFLAGS   += -pthread
     CXXFLAGS += -pthread
 endif
-ifeq ($(UNAME_S),FreeBSD)
-    CFLAGS   += -pthread
-    CXXFLAGS += -pthread
-endif

 # Architecture specific
-# TODO: probably these flags need to be tweaked on some architectures
-# feel free to update the Makefile for your architecture and send a pull request or issue
-ifeq ($(UNAME_M),x86_64)
+ifeq ($(UNAME_P),x86_64)
     CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
-ifeq ($(UNAME_M),amd64)
-    CFLAGS += -mavx -mavx2 -mfma -mf16c
-endif
-ifndef WHISPER_NO_ACCELERATE
-    # Mac M1 - include Accelerate framework
-    ifeq ($(UNAME_S),Darwin)
-        CFLAGS  += -DGGML_USE_ACCELERATE -DGGML_PERF
-        LDFLAGS += -framework Foundation -framework Accelerate -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-    endif
-endif
-ifneq ($(filter aarch64%,$(UNAME_M)),)
-endif
+ifneq ($(filter arm%,$(UNAME_P)),)
+    # Mac M1
+endif
+ifneq ($(filter aarch64%,$(UNAME_P)),)
+endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
     # Raspberry Pi 1, 2, 3
     CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
 endif

@@ -81,24 +49,18 @@ endif
 #
 # Build library + main
 #

-main: examples/main/main.cpp ggml.o ggml-mtl.o whisper.o
-    $(CXX) $(CXXFLAGS) examples/main/main.cpp whisper.o ggml.o ggml-mtl.o -o main $(LDFLAGS)
+main: main.cpp ggml.o whisper.o
+    $(CXX) $(CXXFLAGS) main.cpp whisper.o ggml.o -o main
     ./main -h

 ggml.o: ggml.c ggml.h
-    $(CC) $(CFLAGS) -c ggml.c -o ggml.o
-
-ggml-mtl.o: ggml-mtl.m ggml-mtl.h
-    $(CC) $(CFLAGS) -c ggml-mtl.m -o ggml-mtl.o
+    $(CC) $(CFLAGS) -c ggml.c

 whisper.o: whisper.cpp whisper.h
-    $(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
-
-libwhisper.a: ggml.o ggml-mtl.o whisper.o
-    $(AR) rcs libwhisper.a ggml.o ggml-mtl.o whisper.o
+    $(CXX) $(CXXFLAGS) -c whisper.cpp

 clean:
-    rm -f *.o main stream bench libwhisper.a
+    rm -f *.o main

 #
 # Examples

@@ -106,11 +68,8 @@ clean:
 CC_SDL=`sdl2-config --cflags --libs`

-stream: examples/stream/stream.cpp ggml.o whisper.o
-    $(CXX) $(CXXFLAGS) examples/stream/stream.cpp ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
-
-bench: examples/bench/bench.cpp ggml.o whisper.o
-    $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
+stream: stream.cpp ggml.o whisper.o
+    $(CXX) $(CXXFLAGS) stream.cpp ggml.o whisper.o -o stream $(CC_SDL)

 #
 # Audio samples

@@ -150,7 +109,7 @@ samples:
 .PHONY: large
 tiny.en tiny base.en base small.en small medium.en medium large: main
-    bash ./models/download-ggml-model.sh $@
+    bash ./download-ggml-model.sh $@
     @echo ""
     @echo "==============================================="
     @echo "Running $@ on all samples in ./samples ..."

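The model-name targets at the end of the Makefile chain the whole workflow: they depend on `main`, fetch the matching ggml model with the download script, and then run it on the bundled samples. A short sketch of the resulting commands, assuming the right-hand (post-change) layout where `download-ggml-model.sh` lives in the repository root:

```bash
make                 # build ./main from main.cpp, ggml.o and whisper.o
make base.en         # download ggml-base.en.bin and transcribe everything in ./samples
./main -f input.wav  # transcribe a single 16-bit WAV file directly
```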
README.md (349 changes)

@@ -6,77 +6,26 @@
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

 - Plain C/C++ implementation without dependencies
-- Apple silicon first-class citizen - optimized via Arm Neon and Accelerate framework
-- AVX intrinsics support for x86 architectures
+- ARM_NEON and AVX intrinsics support
 - Mixed F16 / F32 precision
 - Low memory usage (Flash Attention + Flash Forward)
 - Zero memory allocations at runtime
 - Runs on the CPU
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
+- Supported platforms: Linux, Mac OS (Intel and Arm), Raspberry Pi, Android

-Supported platforms:
-
-- [x] Mac OS (Intel and Arm)
-- [x] [iOS](examples/whisper.objc)
-- [x] Linux
-- [x] [WebAssembly](examples/whisper.wasm)
-- [x] [Windows (MSVC and MinGW)](https://github.com/ggerganov/whisper.cpp/issues/5)
-- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/issues/7)
-- [x] [Android](https://github.com/ggerganov/whisper.cpp/issues/30)
-
-The entire implementation of the model is contained in 2 source files:
-
-- Tensor operations: [ggml.h](ggml.h) / [ggml.c](ggml.c)
-- Transformer inference: [whisper.h](whisper.h) / [whisper.cpp](whisper.cpp)
-
-Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
-As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device:
-
-https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
-
-## Implementation details
-
-- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
-- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
-- Sample usage is demonstrated in [main.cpp](examples/main)
-- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
-- Various other examples are available in the [examples](examples) folder
-
-The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
-instrisics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
-the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
-
-## Limitations
-
-- Inference only
-- No GPU support
-- Very basic greedy sampling scheme - always pick up the token with highest probability.
-  This should be similar to the [GreedyDecoder](https://github.com/openai/whisper/blob/main/whisper/decoding.py#L249-L274)
-  from the original python implementation, so in order to make a fair comparison between the 2 implementations, make sure
-  to run the python code with the following parameters:
-
-  ```
-  whisper --best_of None --beam_size None ...
-  ```
-
-  In the future, `whisper.cpp` will support more sampling strategies.
-
-## Quick start
-
-First, download one of the Whisper models converted in [ggml format](models). For example:
+## Usage
+
+To build the main program, run `make`. You can then transcribe a `.wav` file like this:

 ```bash
-bash ./models/download-ggml-model.sh base.en
+$ ./main -f input.wav
 ```

-Now build the [main](examples/main) example and transcribe an audio file like this:
+Before running the program, make sure to download one of the ggml Whisper models. For example:

 ```bash
-# build the main example
-make
-
-# transcribe an audio file
-./main -f input.wav
+bash ./download-ggml-model.sh base.en
 ```

 ---

@@ -85,10 +34,9 @@ For a quick demo, simply run `make base.en`:

 ```java
 $ make base.en
-cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
-c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o
-c++ -I. -I./examples -O3 -std=c++11 -pthread examples/main/main.cpp whisper.o ggml.o -o main -framework Accelerate
+cc  -O3 -std=c11   -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread -c ggml.c
+c++ -O3 -std=c++11 -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread -c whisper.cpp
+c++ -O3 -std=c++11 -Wall -Wextra -Wno-unused-parameter -Wno-unused-function -pthread main.cpp whisper.o ggml.o -o main
 ./main -h

 usage: ./main [options] file0.wav file1.wav ...

@@ -97,28 +45,17 @@ options:
   -h,       --help           show this help message and exit
   -s SEED,  --seed SEED      RNG seed (default: -1)
   -t N,     --threads N      number of threads to use during computation (default: 4)
-  -p N,     --processors N   number of processors to use during computation (default: 1)
-  -ot N,    --offset-t N     time offset in milliseconds (default: 0)
-  -on N,    --offset-n N     segment index offset (default: 0)
-  -mc N,    --max-context N  maximum number of text context tokens to store (default: max)
-  -ml N,    --max-len N      maximum segment length in characters (default: 0)
-  -wt N,    --word-thold N   word timestamp probability threshold (default: 0.010000)
   -v,       --verbose        verbose output
             --translate      translate from source language to english
-  -otxt,    --output-txt     output result in a text file
-  -ovtt,    --output-vtt     output result in a vtt file
-  -osrt,    --output-srt     output result in a srt file
-  -owts,    --output-words   output script for generating karaoke video
   -ps,      --print_special  print special tokens
-  -pc,      --print_colors   print colors
   -nt,      --no_timestamps  do not print timestamps
   -l LANG,  --language LANG  spoken language (default: en)
   -m FNAME, --model FNAME    model path (default: models/ggml-base.en.bin)
   -f FNAME, --file FNAME     input WAV file path

-bash ./models/download-ggml-model.sh base.en
+bash ./download-ggml-model.sh base.en
 Downloading ggml model base.en ...
-ggml-base.en.bin          100%[========================>] 141.11M  6.34MB/s    in 24s
+models/ggml-base.en.bin   100%[===================================>] 141.11M  6.49MB/s    in 23s
 Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
 You can now use it like this:

@@ -146,33 +83,30 @@ whisper_model_load: n_text_layer  = 6
 whisper_model_load: n_mels        = 80
 whisper_model_load: f16           = 1
 whisper_model_load: type          = 2
-whisper_model_load: mem_required  = 670.00 MB
+whisper_model_load: mem_required  = 377.00 MB
 whisper_model_load: adding 1607 extra tokens
-whisper_model_load: ggml ctx size = 140.60 MB
+whisper_model_load: ggml ctx size = 163.43 MB
 whisper_model_load: memory size   = 22.83 MB
 whisper_model_load: model size    = 140.54 MB

-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
-
-main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
+main: processing 'samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, lang = en, task = transcribe, timestamps = 1 ...

-[00:00:00.000 --> 00:00:11.000]   And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.
+[00:00.000 --> 00:11.000]   And so my fellow Americans, ask not what your country can do for you, ask what you can do for your country.

-whisper_print_timings:     load time =   105.91 ms
-whisper_print_timings:      mel time =    24.62 ms
-whisper_print_timings:   sample time =     3.63 ms
-whisper_print_timings:   encode time =   324.71 ms / 54.12 ms per layer
-whisper_print_timings:   decode time =    83.58 ms / 13.93 ms per layer
-whisper_print_timings:    total time =   542.81 ms
+whisper_print_timings:     load time =    77.48 ms
+whisper_print_timings:      mel time =    26.10 ms
+whisper_print_timings:   sample time =     2.19 ms
+whisper_print_timings:   encode time =   632.95 ms / 105.49 ms per layer
+whisper_print_timings:   decode time =    85.11 ms / 14.18 ms per layer
+whisper_print_timings:    total time =   824.14 ms
 ```

 The command downloads the `base.en` model converted to custom `ggml` format and runs the inference on all `.wav` samples in the folder `samples`.

 For detailed usage instructions, run: `./main -h`

-Note that the [main](examples/main) example currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
+Note that `whisper.cpp` currently runs only with 16-bit WAV files, so make sure to convert your input before running the tool.
 For example, you can use `ffmpeg` like this:

 ```java

@@ -203,23 +137,10 @@ make medium
 make large
 ```

-## Memory usage
-
-| Model  | Disk   | Mem     | SHA                                        |
-| ---    | ---    | ---     | ---                                        |
-| tiny   | 75 MB  | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
-| base   | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
-| small  | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
-| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
-
 ## Another example

 Here is another example of transcribing a [3:24 min speech](https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg)
-in about half a minute on a MacBook M1 Pro, using `medium.en` model:
+in less than a minute on a MacBook M1 Pro, using `medium.en` model:

-<details>
-  <summary>Expand to see the result</summary>
-
 ```java
 $ ./main -m models/ggml-medium.en.bin -f samples/gb1.wav -t 8

@@ -237,187 +158,86 @@ whisper_model_load: n_text_layer  = 24
 whisper_model_load: n_mels        = 80
 whisper_model_load: f16           = 1
 whisper_model_load: type          = 4
-whisper_model_load: mem_required  = 2610.00 MB
+whisper_model_load: mem_required  = 2502.00 MB
 whisper_model_load: adding 1607 extra tokens
 whisper_model_load: ggml ctx size = 1644.97 MB
 whisper_model_load: memory size   = 182.62 MB
 whisper_model_load: model size    = 1462.12 MB
+log_mel_spectrogram: n_sample = 3179750, n_len = 19873
+log_mel_spectrogram: recording length: 198.734375 s

-main: processing 'samples/gb1.wav' (3179750 samples, 198.7 sec), 8 threads, lang = en, task = transcribe, timestamps = 1 ...
+main: processing 3179750 samples (198.7 sec), 8 threads, lang = english, task = transcribe, timestamps = 1 ...

 [00:00.000 --> 00:08.000]   My fellow Americans, this day has brought terrible news and great sadness to our country.
-[00:08.000 --> 00:17.000]   At nine o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
-[00:17.000 --> 00:23.000]   A short time later, debris was seen falling from the skies above Texas.
-[00:23.000 --> 00:29.000]   The Columbia's lost. There are no survivors.
+[00:08.000 --> 00:17.000]   At 9 o'clock this morning, Mission Control in Houston lost contact with our Space Shuttle Columbia.
+[00:17.000 --> 00:24.000]   A short time later, debris was seen falling from the skies above Texas.
+[00:24.000 --> 00:29.000]   The Columbia's lost. There are no survivors.
 [00:29.000 --> 00:32.000]   On board was a crew of seven.
-[00:32.000 --> 00:39.000]   Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark,
-[00:39.000 --> 00:48.000]   Captain David Brown, Commander William McCool, Dr. Kultna Shavla, and Ilan Ramon,
-[00:48.000 --> 00:52.000]   a colonel in the Israeli Air Force.
+[00:32.000 --> 00:43.000]   Colonel Rick Husband, Lieutenant Colonel Michael Anderson, Commander Laurel Clark, Captain David Brown, Commander William McCool,
+[00:43.000 --> 00:52.000]   Dr. Kultner Aschavla, and Elon Ramon, a Colonel in the Israeli Air Force.
 [00:52.000 --> 00:58.000]   These men and women assumed great risk in the service to all humanity.
-[00:58.000 --> 01:03.000]   In an age when space flight has come to seem almost routine,
-[01:03.000 --> 01:07.000]   it is easy to overlook the dangers of travel by rocket
-[01:07.000 --> 01:12.000]   and the difficulties of navigating the fierce outer atmosphere of the Earth.
-[01:12.000 --> 01:18.000]   These astronauts knew the dangers, and they faced them willingly,
-[01:18.000 --> 01:23.000]   knowing they had a high and noble purpose in life.
-[01:23.000 --> 01:31.000]   Because of their courage and daring and idealism, we will miss them all the more.
-[01:31.000 --> 01:36.000]   All Americans today are thinking as well of the families of these men and women
-[01:36.000 --> 01:40.000]   who have been given this sudden shock and grief.
-[01:40.000 --> 01:45.000]   You're not alone. Our entire nation grieves with you,
-[01:45.000 --> 01:52.000]   and those you love will always have the respect and gratitude of this country.
+[00:58.000 --> 01:06.000]   In an age when space flight has come to seem almost routine, it is easy to overlook the dangers of travel by rocket
+[01:06.000 --> 01:12.000]   and the difficulties of navigating the fierce outer atmosphere of the Earth.
+[01:12.000 --> 01:22.000]   These astronauts knew the dangers, and they faced them willingly, knowing they had a high and noble purpose in life.
+[01:22.000 --> 01:30.000]   Because of their courage, endearing, and idealism, we will miss them all the more.
+[01:30.000 --> 01:40.000]   All Americans today are thinking as well of the families of these men and women who have been given this sudden shock and grief.
+[01:40.000 --> 01:45.000]   You're not alone. Our entire nation agrees with you.
+[01:45.000 --> 01:52.000]   And those you love will always have the respect and gratitude of this country.
 [01:52.000 --> 01:56.000]   The cause in which they died will continue.
-[01:56.000 --> 02:04.000]   Mankind is led into the darkness beyond our world by the inspiration of discovery
-[02:04.000 --> 02:11.000]   and the longing to understand. Our journey into space will go on.
+[01:56.000 --> 02:07.000]   Mankind is led into the darkness beyond our world by the inspiration of discovery and the longing to understand.
+[02:07.000 --> 02:11.000]   Our journey into space will go on.
 [02:11.000 --> 02:16.000]   In the skies today, we saw destruction and tragedy.
 [02:16.000 --> 02:22.000]   Yet farther than we can see, there is comfort and hope.
-[02:22.000 --> 02:29.000]   In the words of the prophet Isaiah, "Lift your eyes and look to the heavens
-[02:29.000 --> 02:35.000]   who created all these. He who brings out the starry hosts one by one
-[02:35.000 --> 02:39.000]   and calls them each by name."
-[02:39.000 --> 02:46.000]   Because of His great power and mighty strength, not one of them is missing.
-[02:46.000 --> 02:55.000]   The same Creator who names the stars also knows the names of the seven souls we mourn today.
-[02:55.000 --> 03:01.000]   The crew of the shuttle Columbia did not return safely to earth,
-[03:01.000 --> 03:05.000]   yet we can pray that all are safely home.
-[03:05.000 --> 03:13.000]   May God bless the grieving families, and may God continue to bless America.
-[03:13.000 --> 03:41.000]   Audio
+[02:22.000 --> 02:31.000]   In the words of the prophet Isaiah, "Lift your eyes and look to the heavens who created all these.
+[02:31.000 --> 02:39.000]   He who brings out the starry hosts one by one and calls them each by name."
+[02:39.000 --> 02:46.000]   Because of his great power and mighty strength, not one of them is missing.
+[02:46.000 --> 02:55.000]   The same creator who names the stars also knows the names of the seven souls we mourn today.
+[02:55.000 --> 03:05.000]   The crew of the shuttle Columbia did not return safely to Earth, yet we can pray that all are safely home.
+[03:05.000 --> 03:14.000]   May God bless the grieving families and may God continue to bless America.
+[03:14.000 --> 03:24.000]   [Music]

-whisper_print_timings:     load time =   575.92 ms
-whisper_print_timings:      mel time =   230.60 ms
-whisper_print_timings:   sample time =    73.19 ms
-whisper_print_timings:   encode time = 19552.61 ms / 814.69 ms per layer
-whisper_print_timings:   decode time = 13249.96 ms / 552.08 ms per layer
-whisper_print_timings:    total time = 33686.27 ms
+main:     load time =   522.18 ms
+main:      mel time =   423.43 ms
+main:   sample time =    31.42 ms
+main:   encode time = 41518.51 ms / 1729.94 ms per layer
+main:   decode time = 14907.22 ms
+main:    total time = 57416.63 ms
 ```
-</details>

 ## Real-time audio input example

 This is a naive example of performing real-time inference on audio from your microphone.
-The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continously.
-More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
+The `stream` tool samples the audio every 3 seconds and runs the transcription continously. More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

 ```java
-./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
+$ ./stream -m models/ggml-small.en.bin -t 8
 ```

-https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
+https://user-images.githubusercontent.com/1991296/193465125-c163d304-64f6-4f5d-83e5-72239c9a203e.mp4

-## Confidence color-coding
+## Implementation details

-Adding the `--print-colors` argument will print the transcribed text using an experimental color coding strategy
-to highlight words with high or low confidence:
+- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
+- The high-level C-style API is implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
+- Simple usage is demonstrated in [main.cpp](main.cpp)
+- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](stream.cpp)

-<img width="965" alt="image" src="https://user-images.githubusercontent.com/1991296/197356445-311c8643-9397-4e5e-b46e-0b4b4daa2530.png">
+## Limitations

-## Controlling the length of the generated text segments (experimental)
+- Very basic greedy sampling scheme - always pick up the top token. You can implement your own strategy
+- Inference only
+- No GPU support

-For example, to limit the line length to a maximum of 16 characters, simply add `-ml 16`:
+## Memory usage

-```java
-./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 16
-
-whisper_model_load: loading model from './models/ggml-base.en.bin'
-...
-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
-
-main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
-
-[00:00:00.000 --> 00:00:00.850]   And so my
-[00:00:00.850 --> 00:00:01.590]   fellow
-[00:00:01.590 --> 00:00:04.140]   Americans, ask
-[00:00:04.140 --> 00:00:05.660]   not what your
-[00:00:05.660 --> 00:00:06.840]   country can do
-[00:00:06.840 --> 00:00:08.430]   for you, ask
-[00:00:08.430 --> 00:00:09.440]   what you can do
-[00:00:09.440 --> 00:00:10.020]   for your
-[00:00:10.020 --> 00:00:11.000]   country.
-```
-
-## Word-level timestamp
-
-The `--max-len` argument can be used to obtain word-level timestamps. Simply use `-ml 1`:
-
-```java
-./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -ml 1
-
-whisper_model_load: loading model from './models/ggml-base.en.bin'
-...
-system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
-
-main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
-
-[00:00:00.000 --> 00:00:00.320]
-[00:00:00.320 --> 00:00:00.370]   And
-[00:00:00.370 --> 00:00:00.690]   so
-[00:00:00.690 --> 00:00:00.850]   my
-[00:00:00.850 --> 00:00:01.590]   fellow
-[00:00:01.590 --> 00:00:02.850]   Americans
-[00:00:02.850 --> 00:00:03.300]   ,
-[00:00:03.300 --> 00:00:04.140]   ask
-[00:00:04.140 --> 00:00:04.990]   not
-[00:00:04.990 --> 00:00:05.410]   what
-[00:00:05.410 --> 00:00:05.660]   your
-[00:00:05.660 --> 00:00:06.260]   country
-[00:00:06.260 --> 00:00:06.600]   can
-[00:00:06.600 --> 00:00:06.840]   do
-[00:00:06.840 --> 00:00:07.010]   for
-[00:00:07.010 --> 00:00:08.170]   you
-[00:00:08.170 --> 00:00:08.190]   ,
-[00:00:08.190 --> 00:00:08.430]   ask
-[00:00:08.430 --> 00:00:08.910]   what
-[00:00:08.910 --> 00:00:09.040]   you
-[00:00:09.040 --> 00:00:09.320]   can
-[00:00:09.320 --> 00:00:09.440]   do
-[00:00:09.440 --> 00:00:09.760]   for
-[00:00:09.760 --> 00:00:10.020]   your
-[00:00:10.020 --> 00:00:10.510]   country
-[00:00:10.510 --> 00:00:11.000]   .
-```
-
-## Karaoke-style movie generation (experimental)
-
-The [main](examples/main) example provides support for output of karaoke-style movies, where the
-currently pronounced word is highlighted. Use the `-wts` argument and run the generated bash script.
-This requires to have `ffmpeg` installed.
-
-Here are a few *"typical"* examples:
-
-```java
-./main -m ./models/ggml-base.en.bin -f ./samples/jfk.wav -owts
-source ./samples/jfk.wav.wts
-ffplay ./samples/jfk.wav.mp4
-```
-
-https://user-images.githubusercontent.com/1991296/199337465-dbee4b5e-9aeb-48a3-b1c6-323ac4db5b2c.mp4
-
----
-
-```java
-./main -m ./models/ggml-base.en.bin -f ./samples/mm0.wav -owts
-source ./samples/mm0.wav.wts
-ffplay ./samples/mm0.wav.mp4
-```
-
-https://user-images.githubusercontent.com/1991296/199337504-cc8fd233-0cb7-4920-95f9-4227de3570aa.mp4
-
----
-
-```java
-./main -m ./models/ggml-base.en.bin -f ./samples/gb0.wav -owts
-source ./samples/gb0.wav.wts
-ffplay ./samples/gb0.wav.mp4
-```
-
-https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a0cd-f28a317987ba.mp4
-
----
-
-## Benchmarks
-
-In order to have an objective comparison of the performance of the inference across different system configurations,
-use the [bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
-took to execute it. The results are summarized in the following Github issue:
-
-[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
+| Model  | Disk   | Mem     |
+| ---    | ---    | ---     |
+| tiny   | 75 MB  | ~240 MB |
+| base   | 142 MB | ~380 MB |
+| small  | 466 MB | ~970 MB |
+| medium | 1.5 GB | ~2.5 GB |
+| large  | 2.9 GB | ~4.6 GB |

 ## ggml format

@@ -428,21 +248,6 @@ The original models are converted to a custom binary format. This allows to pack
 - vocabulary
 - weights

-You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script or from here:
-
-https://ggml.ggerganov.com
-
-For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README in [models](models).
-
-## Bindings
-
-- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs)
-- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm)
-- [ ] Python:
-- [ ] Java:
-
-## Examples
-
-There are various examples of using the library for different projects in the [examples](examples) folder. Check them out!
-
-## [Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126)
+You can download the converted models using the [download-ggml-model.sh](download-ggml-model.sh) script.
+
+For more details, see the conversion script [convert-pt-to-ggml.py](convert-pt-to-ggml.py) or the README in [models](models).

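Both versions of the README note that only 16-bit WAV input is supported, but the `ffmpeg` snippet that followed is cut off in this capture. The karaoke demo script later in this comparison uses the same conversion, resampling to 16 kHz mono signed 16-bit PCM:

```bash
# convert arbitrary audio to the 16-bit / 16 kHz mono WAV that ./main expects
ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
```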
bindings/CMakeLists.txt (deleted)

@ -1,3 +0,0 @@
if (EMSCRIPTEN)
add_subdirectory(javascript)
endif()

Submodule bindings/ios deleted from 4bda8e9d80

bindings/javascript/.gitignore (deleted)

@ -1 +0,0 @@
publish.log

bindings/javascript/CMakeLists.txt (deleted)

@ -1,33 +0,0 @@
set(TARGET libwhisper)
add_executable(${TARGET}
emscripten.cpp
)
target_link_libraries(${TARGET} PRIVATE
whisper
)
unset(EXTRA_FLAGS)
if (WHISPER_WASM_SINGLE_FILE)
set(EXTRA_FLAGS "-s SINGLE_FILE=1")
message(STATUS "Embedding WASM inside whisper.js")
add_custom_command(
TARGET libwhisper POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy
${CMAKE_BINARY_DIR}/bin/libwhisper.js
${CMAKE_CURRENT_SOURCE_DIR}/whisper.js
)
endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \
-s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \
-s INITIAL_MEMORY=1610612736 \
-s TOTAL_MEMORY=1610612736 \
-s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \
")

bindings/javascript/emscripten.cpp (deleted)

@ -1,89 +0,0 @@
#include "whisper.h"
#include <emscripten.h>
#include <emscripten/bind.h>
#include <vector>
#include <thread>
std::vector<struct whisper_context *> g_contexts(4, nullptr);
EMSCRIPTEN_BINDINGS(whisper) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
for (size_t i = 0; i < g_contexts.size(); ++i) {
if (g_contexts[i] == nullptr) {
g_contexts[i] = whisper_init(path_model.c_str());
if (g_contexts[i] != nullptr) {
return i + 1;
} else {
return (size_t) 0;
}
}
}
return (size_t) 0;
}));
emscripten::function("free", emscripten::optional_override([](size_t index) {
--index;
if (index < g_contexts.size()) {
whisper_free(g_contexts[index]);
g_contexts[index] = nullptr;
}
}));
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
--index;
if (index >= g_contexts.size()) {
return -1;
}
if (g_contexts[index] == nullptr) {
return -2;
}
struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special_tokens = false;
params.translate = translate;
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
params.offset_ms = 0;
std::vector<float> pcmf32;
const int n = audio["length"].as<int>();
emscripten::val heap = emscripten::val::module_property("HEAPU8");
emscripten::val memory = heap["buffer"];
pcmf32.resize(n);
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(pcmf32.data()), n);
memoryView.call<void>("set", audio);
// print system information
{
printf("system_info: n_threads = %d / %d | %s\n",
params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
printf("%s: processing %d samples, %.1f sec, %d threads, %d processors, lang = %s, task = %s ...\n",
__func__, int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
params.n_threads, 1,
params.language,
params.translate ? "translate" : "transcribe");
printf("\n");
}
int ret = whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
whisper_print_timings(g_contexts[index]);
return ret;
}));
}

File diff suppressed because one or more lines are too long

cmake/BuildTypes.cmake (deleted)

@ -1,54 +0,0 @@
# Add new build types
# ReleaseGG - Release with enabled asserts
SET(CMAKE_CXX_FLAGS_RELEASEGG
"-O3"
CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts."
FORCE )
SET(CMAKE_C_FLAGS_RELEASEGG
"-O3"
CACHE STRING "Flags used by the compiler during release builds with enabled asserts."
FORCE )
SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG
""
CACHE STRING "Flags used for linking binaries during release builds with enabled asserts."
FORCE )
SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG
""
CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts."
FORCE )
MARK_AS_ADVANCED(
CMAKE_CXX_FLAGS_RELEASEGG
CMAKE_C_FLAGS_RELEASEGG
CMAKE_EXE_LINKER_FLAGS_RELEASEGG
CMAKE_SHARED_LINKER_FLAGS_RELEASEGG )
# RelWithDebInfoGG - RelWithDebInfo with enabled asserts
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
"-O2 -g"
CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG
"-O2 -g"
CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
""
CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG
""
CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts."
FORCE )
MARK_AS_ADVANCED(
CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
CMAKE_C_FLAGS_RELWITHDEBINFOGG
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG )
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG")
endif()

cmake/GitVars.cmake (deleted)

@ -1,22 +0,0 @@
find_package(Git)
# the commit's SHA1
execute_process(COMMAND
"${GIT_EXECUTABLE}" describe --match=NeVeRmAtCh --always --abbrev=8
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_SHA1
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the date of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%ad --date=local
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_DATE
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
# the subject of the commit
execute_process(COMMAND
"${GIT_EXECUTABLE}" log -1 --format=%s
WORKING_DIRECTORY "${CMAKE_SOURCE_DIR}"
OUTPUT_VARIABLE GIT_COMMIT_SUBJECT
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

convert-pt-to-ggml.py

@@ -234,7 +234,7 @@ dir_tokenizer = tokenizer.name_or_path
 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"

-with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
+with open(dir_tokenizer + "/vocab.json", "r") as f:
     tokens = json.load(f)

 # use 16-bit or 32-bit floats

@@ -271,7 +271,7 @@ byte_decoder = {v:k for k, v in byte_encoder.items()}
 fout.write(struct.pack("i", len(tokens)))

 for key in tokens:
-    text = bytearray([byte_decoder[c] for c in key])
+    text = bytearray([byte_decoder[c] for c in key]).decode('utf-8', errors='replace').encode('utf-8')
     fout.write(struct.pack("i", len(text)))
     fout.write(text)

download-ggml-model.sh

@@ -3,17 +3,7 @@
 # This script downloads Whisper model files that have already been converted to ggml format.
 # This way you don't have to convert them yourself.

-# get the path of this script
-function get_script_path() {
-    if [ -x "$(command -v realpath)" ]; then
-        echo "$(dirname $(realpath $0))"
-    else
-        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
-        echo "$ret"
-    fi
-}
-
-models_path=$(get_script_path)
+ggml_path=$(dirname $(realpath $0))

 # Whisper models
 models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )

@@ -48,22 +38,14 @@ fi
 printf "Downloading ggml model $model ...\n"

-cd $models_path
+mkdir -p models

-if [ -f "ggml-$model.bin" ]; then
+if [ -f "models/ggml-$model.bin" ]; then
     printf "Model $model already exists. Skipping download.\n"
     exit 0
 fi

-if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O ggml-$model.bin https://ggml.ggerganov.com/ggml-model-whisper-$model.bin
-elif [ -x "$(command -v curl)" ]; then
-    curl --output ggml-$model.bin https://ggml.ggerganov.com/ggml-model-whisper-$model.bin
-else
-    printf "Either wget or curl is required to download models.\n"
-    exit 1
-fi
+wget --quiet --show-progress -O models/ggml-$model.bin https://ggml.ggerganov.com/ggml-model-whisper-$model.bin

 if [ $? -ne 0 ]; then
     printf "Failed to download ggml model $model \n"

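Note the behavioral difference above: the version marked `-` changes into the script's own directory and downloads `ggml-<model>.bin` there (falling back from `wget` to `curl`), while the version marked `+` always creates and fills a `models/` directory relative to the caller and requires `wget`. Either way the invocation is the same, as shown throughout this comparison:

```bash
bash ./download-ggml-model.sh base.en   # fetches the model from https://ggml.ggerganov.com
```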
examples/CMakeLists.txt (deleted)

@ -1,27 +0,0 @@
# dependencies
find_package(Threads REQUIRED)
# third-party
if (WHISPER_SUPPORT_SDL2)
# SDL2
find_package(SDL2 REQUIRED)
string(STRIP "${SDL2_LIBRARIES}" SDL2_LIBRARIES)
message(STATUS "SDL2_INCLUDE_DIRS = ${SDL2_INCLUDE_DIRS}")
message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
endif()
# examples
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
if (EMSCRIPTEN)
add_subdirectory(whisper.wasm)
else()
add_subdirectory(main)
add_subdirectory(stream)
add_subdirectory(bench)
endif()

examples/bench/CMakeLists.txt (deleted)

@ -1,3 +0,0 @@
set(TARGET bench)
add_executable(${TARGET} bench.cpp)
target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})

examples/bench/README.md (deleted)

@ -1,52 +0,0 @@
# bench
A very basic tool for benchmarking the inference performance on your device. The tool simply runs the Encoder part of the transformer on some random audio data and records the execution time. This way we can have an objective comparison of the performance of the model for various setups.
Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89
```bash
# build the bench tool
$ make bench
# run it on the small.en model using 4 threads
$ ./bench -m ./models/ggml-small.en.bin -t 4
whisper_model_load: loading model from './models/ggml-small.en.bin'
whisper_model_load: n_vocab = 51864
whisper_model_load: n_audio_ctx = 1500
whisper_model_load: n_audio_state = 768
whisper_model_load: n_audio_head = 12
whisper_model_load: n_audio_layer = 12
whisper_model_load: n_text_ctx = 448
whisper_model_load: n_text_state = 768
whisper_model_load: n_text_head = 12
whisper_model_load: n_text_layer = 12
whisper_model_load: n_mels = 80
whisper_model_load: f16 = 1
whisper_model_load: type = 3
whisper_model_load: mem_required = 1048.00 MB
whisper_model_load: adding 1607 extra tokens
whisper_model_load: ggml ctx size = 533.05 MB
whisper_model_load: memory size = 68.48 MB
whisper_model_load: model size = 464.44 MB
whisper_print_timings: load time = 240.82 ms
whisper_print_timings: mel time = 0.00 ms
whisper_print_timings: sample time = 0.00 ms
whisper_print_timings: encode time = 1062.21 ms / 88.52 ms per layer
whisper_print_timings: decode time = 0.00 ms / 0.00 ms per layer
whisper_print_timings: total time = 1303.04 ms
system_info: n_threads = 4 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 |
If you wish, you can submit these results here:
https://github.com/ggerganov/whisper.cpp/issues/89
Please include the following information:
- CPU model
- Operating system
- Compiler
```

examples/bench/bench.cpp (deleted)

@ -1,95 +0,0 @@
#include "whisper.h"
#include <cstdio>
#include <string>
#include <thread>
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
std::string model = "models/ggml-base.en.bin";
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
// whisper init
struct whisper_context * ctx = whisper_init(params.model.c_str());
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n", params.n_threads, std::thread::hardware_concurrency(), whisper_print_system_info());
}
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
return 2;
}
if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
fprintf(stderr, "error: failed to set mel: %d\n", ret);
return 3;
}
if (int ret = whisper_encode(ctx, 0, params.n_threads) != 0) {
fprintf(stderr, "error: failed to encode model: %d\n", ret);
return 4;
}
whisper_print_timings(ctx);
whisper_free(ctx);
fprintf(stderr, "\n");
fprintf(stderr, "If you wish, you can submit these results here:\n");
fprintf(stderr, "\n");
fprintf(stderr, " https://github.com/ggerganov/whisper.cpp/issues/89\n");
fprintf(stderr, "\n");
fprintf(stderr, "Please include the following information:\n");
fprintf(stderr, "\n");
fprintf(stderr, " - CPU model\n");
fprintf(stderr, " - Operating system\n");
fprintf(stderr, " - Compiler\n");
fprintf(stderr, "\n");
return 0;
}

examples/generate-karaoke.sh (deleted)

@ -1,49 +0,0 @@
#!/bin/bash
executable="./main"
model="base.en"
model_path="models/ggml-$model.bin"
# require sox and ffmpeg to be installed
if ! command -v sox &> /dev/null
then
echo "sox could not be found"
exit 1
fi
if ! command -v ffmpeg &> /dev/null
then
echo "ffmpeg could not be found"
exit 2
fi
if [ ! -f "$executable" ]; then
echo "'$executable' does not exist. Please build it first."
exit 3
fi
if [ ! -f "$model_path" ]; then
echo "'$model_path' does not exist. Please download it first."
exit 4
fi
# record some raw audio
sox -d rec.wav
# resample to 16kHz
ffmpeg -y -i ./rec.wav -ar 16000 -ac 1 -c:a pcm_s16le ./rec16.wav > /dev/null 2>&1
# run Whisper
echo "Processing ..."
./main -m models/ggml-base.en.bin rec16.wav -owts > /dev/null 2>&1
# generate Karaoke video
echo "Generating video ..."
source rec16.wav.wts > /dev/null 2>&1
# play the video
echo "Playing ./rec16.wav.mp4 ..."
ffplay -loglevel 0 -autoexit ./rec16.wav.mp4
echo "Done"
exit 0

examples/main/CMakeLists.txt (deleted)

@ -1,3 +0,0 @@
set(TARGET main)
add_executable(${TARGET} main.cpp)
target_link_libraries(${TARGET} PRIVATE whisper ${CMAKE_THREAD_LIBS_INIT})

examples/main/README.md (deleted)

@ -1,34 +0,0 @@
# main
This is the main example demonstrating most of the functionality of the Whisper model.
It can be used as a reference for using the `whisper.cpp` library in other projects.
```
./main -h
usage: ./bin/main [options] file0.wav file1.wav ...
-h, --help show this help message and exit
-s SEED, --seed SEED RNG seed (default: -1)
-t N, --threads N number of threads to use during computation (default: 4)
-p N, --processors N number of processors to use during computation (default: 1)
-ot N, --offset-t N time offset in milliseconds (default: 0)
-on N, --offset-n N segment index offset (default: 0)
-mc N, --max-context N maximum number of text context tokens to store (default: max)
-ml N, --max-len N maximum segment length in characters (default: 0)
-wt N, --word-thold N word timestamp probability threshold (default: 0.010000)
-v, --verbose verbose output
--translate translate from source language to english
-otxt, --output-txt output result in a text file
-ovtt, --output-vtt output result in a vtt file
-osrt, --output-srt output result in a srt file
-owts, --output-words output script for generating karaoke video
-ps, --print_special print special tokens
-pc, --print_colors print colors
-nt, --no_timestamps do not print timestamps
-l LANG, --language LANG spoken language (default: en)
-m FNAME, --model FNAME model path (default: models/ggml-base.en.bin)
-f FNAME, --file FNAME input WAV file path
-h, --help show this help message and exit
```

examples/main/main.cpp (deleted)

@ -1,591 +0,0 @@
#include "whisper.h"
// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#include <cmath>
#include <fstream>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
const std::vector<std::string> k_colors = {
"\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
"\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t, bool comma = false) {
int64_t msec = t * 10;
int64_t hr = msec / (1000 * 60 * 60);
msec = msec - hr * (1000 * 60 * 60);
int64_t min = msec / (1000 * 60);
msec = msec - min * (1000 * 60);
int64_t sec = msec / 1000;
msec = msec - sec * 1000;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
return std::string(buf);
}
// helper function to replace substrings
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
for (size_t pos = 0; ; pos += replace.length()) {
pos = s.find(search, pos);
if (pos == std::string::npos) break;
s.erase(pos, search.length());
s.insert(pos, replace);
}
}
// command-line parameters
struct whisper_params {
int32_t seed = -1; // RNG seed, not used currently
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
int32_t duration_ms = 0;
int32_t max_context = -1;
int32_t max_len = 0;
float word_thold = 0.01f;
bool verbose = false;
bool translate = false;
bool output_txt = false;
bool output_vtt = false;
bool output_srt = false;
bool output_wts = false;
bool print_special_tokens = false;
bool print_colors = false;
bool no_timestamps = false;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::vector<std::string> fname_inp = {};
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg[0] != '-') {
params.fname_inp.push_back(arg);
continue;
}
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-p" || arg == "--processors") {
params.n_processors = std::stoi(argv[++i]);
} else if (arg == "-ot" || arg == "--offset-t") {
params.offset_t_ms = std::stoi(argv[++i]);
} else if (arg == "-on" || arg == "--offset-n") {
params.offset_n = std::stoi(argv[++i]);
} else if (arg == "-d" || arg == "--duration") {
params.duration_ms = std::stoi(argv[++i]);
} else if (arg == "-mc" || arg == "--max-context") {
params.max_context = std::stoi(argv[++i]);
} else if (arg == "-ml" || arg == "--max-len") {
params.max_len = std::stoi(argv[++i]);
} else if (arg == "-wt" || arg == "--word-thold") {
params.word_thold = std::stof(argv[++i]);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--translate") {
params.translate = true;
} else if (arg == "-l" || arg == "--language") {
params.language = argv[++i];
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
} else if (arg == "-otxt" || arg == "--output-txt") {
params.output_txt = true;
} else if (arg == "-ovtt" || arg == "--output-vtt") {
params.output_vtt = true;
} else if (arg == "-osrt" || arg == "--output-srt") {
params.output_srt = true;
} else if (arg == "-owts" || arg == "--output-words") {
params.output_wts = true;
} else if (arg == "-ps" || arg == "--print_special") {
params.print_special_tokens = true;
} else if (arg == "-pc" || arg == "--print_colors") {
params.print_colors = true;
} else if (arg == "-nt" || arg == "--no_timestamps") {
params.no_timestamps = true;
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-f" || arg == "--file") {
params.fname_inp.push_back(argv[++i]);
} else if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
fprintf(stderr, " -d N, --duration N duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
fprintf(stderr, " -v, --verbose verbose output\n");
fprintf(stderr, " --translate translate from source language to english\n");
fprintf(stderr, " -otxt, --output-txt output result in a text file\n");
fprintf(stderr, " -ovtt, --output-vtt output result in a vtt file\n");
fprintf(stderr, " -osrt, --output-srt output result in a srt file\n");
fprintf(stderr, " -owts, --output-words output script for generating karaoke video\n");
fprintf(stderr, " -ps, --print_special print special tokens\n");
fprintf(stderr, " -pc, --print_colors print colors\n");
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path\n");
fprintf(stderr, "\n");
}
void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
const whisper_params & params = *(whisper_params *) user_data;
const int n_segments = whisper_full_n_segments(ctx);
// print the last n_new segments
const int s0 = n_segments - n_new;
if (s0 == 0) {
printf("\n");
}
for (int i = s0; i < n_segments; i++) {
if (params.no_timestamps) {
if (params.print_colors) {
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special_tokens == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
}
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("%s", text);
}
fflush(stdout);
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
if (params.print_colors) {
printf("[%s --> %s] ", to_timestamp(t0).c_str(), to_timestamp(t1).c_str());
for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
if (params.print_special_tokens == false) {
const whisper_token id = whisper_full_get_token_id(ctx, i, j);
if (id >= whisper_token_eot(ctx)) {
continue;
}
}
const char * text = whisper_full_get_token_text(ctx, i, j);
const float p = whisper_full_get_token_p (ctx, i, j);
const int col = std::max(0, std::min((int) k_colors.size() - 1, (int) (std::pow(p, 3)*float(k_colors.size()))));
printf("%s%s%s", k_colors[col].c_str(), text, "\033[0m");
}
printf("\n");
} else {
const char * text = whisper_full_get_segment_text(ctx, i);
printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
}
}
}
}
bool output_txt(struct whisper_context * ctx, const char * fname) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
return false;
}
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
fout << text;
}
return true;
}
bool output_vtt(struct whisper_context * ctx, const char * fname) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
return false;
}
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
fout << "WEBVTT\n\n";
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
fout << to_timestamp(t0) << " --> " << to_timestamp(t1) << "\n";
fout << text << "\n\n";
}
return true;
}
bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
return false;
}
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
fout << i + 1 + params.offset_n << "\n";
fout << to_timestamp(t0, true) << " --> " << to_timestamp(t1, true) << "\n";
fout << text << "\n\n";
}
return true;
}
// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments
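// The generated script looks roughly like this (illustrative, heavily trimmed):
//
//   ffmpeg -i input.wav -f lavfi -i color=size=1200x120:duration=T:rate=25:color=black \
//          -vf "drawtext=...:text='> hello':enable='between(t,t0,t1)',drawtext=..." \
//          -c:v libx264 -pix_fmt yuv420p -y input.wav.mp4
//
// i.e. one drawtext filter per token state, each gated by the token's timestamps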
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
return false;
}
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
// TODO: become parameter
static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
fout << "#!/bin/bash" << "\n";
fout << "\n";
fout << "ffmpeg -i " << fname_inp << " -f lavfi -i color=size=1200x120:duration=" << t_sec << ":rate=25:color=black -vf \"";
for (int i = 0; i < whisper_full_n_segments(ctx); i++) {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
const int n = whisper_full_n_tokens(ctx, i);
std::vector<whisper_token_data> tokens(n);
for (int j = 0; j < n; ++j) {
tokens[j] = whisper_full_get_token_data(ctx, i, j);
}
if (i > 0) {
fout << ",";
}
// background text
fout << "drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='':enable='between(t," << t0/100.0 << "," << t0/100.0 << ")'";
bool is_first = true;
for (int j = 0; j < n; ++j) {
const auto & token = tokens[j];
if (tokens[j].id >= whisper_token_eot(ctx)) {
continue;
}
std::string txt_bg;
std::string txt_fg; // highlight token
std::string txt_ul; // underline
txt_bg = "> ";
txt_fg = "> ";
txt_ul = "\\ \\ ";
{
int ncnt = 0;
for (int k = 0; k < n; ++k) {
const auto & token2 = tokens[k];
if (tokens[k].id >= whisper_token_eot(ctx)) {
continue;
}
const std::string txt = whisper_token_to_str(ctx, token2.id);
txt_bg += txt;
if (k == j) {
for (int l = 0; l < (int) txt.size(); ++l) {
txt_fg += txt[l];
txt_ul += "_";
}
txt_fg += "|";
} else {
for (int l = 0; l < (int) txt.size(); ++l) {
txt_fg += "\\ ";
txt_ul += "\\ ";
}
}
ncnt += txt.size();
}
::replace_all(txt_bg, "'", "");
::replace_all(txt_bg, "\"", "\\\"");
::replace_all(txt_fg, "'", "");
::replace_all(txt_fg, "\"", "\\\"");
}
if (is_first) {
// background text
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=gray:x=(w-text_w)/2:y=h/2:text='" << txt_bg << "':enable='between(t," << t0/100.0 << "," << t1/100.0 << ")'";
is_first = false;
}
// foreground text
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2:text='" << txt_fg << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
// underline
fout << ",drawtext=fontfile='" << font << "':fontsize=24:fontcolor=lightgreen:x=(w-text_w)/2+8:y=h/2+16:text='" << txt_ul << "':enable='between(t," << token.t0/100.0 << "," << token.t1/100.0 << ")'";
}
}
fout << "\" -c:v libx264 -pix_fmt yuv420p -y " << fname_inp << ".mp4" << "\n";
fout << "\n\n";
fout << "echo \"Your video has been saved to " << fname_inp << ".mp4\"" << "\n";
fout << "\n";
fout << "echo \" ffplay " << fname_inp << ".mp4\"\n";
fout << "\n";
fout.close();
fprintf(stderr, "%s: run 'source %s' to generate karaoke video\n", __func__, fname);
return true;
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.seed < 0) {
params.seed = time(NULL);
}
if (params.fname_inp.empty()) {
fprintf(stderr, "error: no input files specified\n");
whisper_print_usage(argc, argv, params);
return 2;
}
// whisper init
struct whisper_context * ctx = whisper_init(params.model.c_str());
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context\n");
return 3;
}
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
// WAV input
std::vector<float> pcmf32;
{
drwav wav;
if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
whisper_print_usage(argc, argv, {});
return 4;
}
if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
return 5;
}
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
return 6;
}
if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
return 7;
}
int n = wav.totalPCMFrameCount;
std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
drwav_uninit(&wav);
// convert to mono, float
pcmf32.resize(n);
if (wav.channels == 1) {
for (int i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[i])/32768.0f;
}
} else {
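// average the two channels: (L + R)/2, normalized by 32768 -> divide by 65536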
for (int i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
}
// print system information
{
fprintf(stderr, "\n");
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
}
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
params.n_threads, params.n_processors,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// run the inference
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_realtime = false;
wparams.print_progress = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.print_special_tokens = params.print_special_tokens;
wparams.translate = params.translate;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
wparams.offset_ms = params.offset_t_ms;
wparams.duration_ms = params.duration_ms;
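// token-level timestamps are needed for the karaoke output (-owts) and for -ml;
// when -owts is used without an explicit -ml, default to 60-character segments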
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
// this callback is called on each new segment
if (!wparams.print_realtime) {
wparams.new_segment_callback = whisper_print_segment_callback;
wparams.new_segment_callback_user_data = &params;
}
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 8;
}
}
// output stuff
{
printf("\n");
// output to text file
if (params.output_txt) {
const auto fname_txt = fname_inp + ".txt";
output_txt(ctx, fname_txt.c_str());
}
// output to VTT file
if (params.output_vtt) {
const auto fname_vtt = fname_inp + ".vtt";
output_vtt(ctx, fname_vtt.c_str());
}
// output to SRT file
if (params.output_srt) {
const auto fname_srt = fname_inp + ".srt";
output_srt(ctx, fname_srt.c_str(), params);
}
// output to WTS file
if (params.output_wts) {
const auto fname_wts = fname_inp + ".wts";
output_wts(ctx, fname_wts.c_str(), fname_inp.c_str(), params, float(pcmf32.size() + 1000)/WHISPER_SAMPLE_RATE);
}
}
}
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}


@ -1,7 +0,0 @@
if (WHISPER_SUPPORT_SDL2)
# stream
set(TARGET stream)
add_executable(${TARGET} stream.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endif ()


@ -1,23 +0,0 @@
# stream
This is a naive example of performing real-time inference on audio from your microphone.
The `stream` tool samples the audio every half a second and runs the transcription continuously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
```bash
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4
The `stream` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev
# Install SDL2 on Mac OS
brew install sdl2
make stream
```


@ -1,92 +0,0 @@
# whisper.nvim
Speech-to-text in Neovim
The transcription is performed on the CPU and no data leaves your computer. Works best on Apple Silicon devices.
https://user-images.githubusercontent.com/1991296/198382564-784e9663-2037-4d04-99b8-f39136929b7e.mp4
## Usage
- Simply press `Ctrl-G` in `INSERT`, `VISUAL` or `NORMAL` mode and say something
- When you are done, press `Ctrl-C` to end the transcription and insert the transcribed text under the cursor
## Installation
*Note: this is a bit tedious and hacky atm, but I hope it will be improved with time*
- Clone this repo and build the `stream` tool:
```
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
make stream
```
- Download the `base.en` Whisper model (140 MB):
```
./models/download-ggml-model.sh base.en
```
- Place the [whisper.nvim](whisper.nvim) script somewhere in your PATH and give it execute permissions:
```
cp examples/whisper.nvim/whisper.nvim ~/bin/
chmod u+x ~/bin/whisper.nvim
```
- Fine-tune the script to your preference and machine parameters:
```
./stream -t 8 -m models/ggml-base.en.bin --step 350 --length 10000 -f /tmp/whisper.nvim 2> /dev/null
```
On slower machines, try to increase the `step` parameter.
- Add the following shortcuts to your `~/.config/nvim/init.vim`:
```
inoremap <C-G> <C-O>:!whisper.nvim<CR><C-O>:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")<CR><C-R>a
nnoremap <C-G> :!whisper.nvim<CR>:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")<CR>"ap
vnoremap <C-G> c<C-O>:!whisper.nvim<CR><C-O>:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")<CR><C-R>a
```
Explanation: pressing `Ctrl-G` runs the [whisper.nvim](whisper.nvim) script which in turn calls the `stream` binary to transcribe your speech through the microphone. The results from the transcription are continuously dumped into `/tmp/whisper.nvim`. After you kill the program with `Ctrl-C`, the vim command grabs the last line from the `/tmp/whisper.nvim` file and puts it under the cursor.
There is probably a much more intelligent way to achieve all this, but this is what I could hack together in an hour. Any suggestions on how to improve it are welcome.
You are now ready to use speech-to-text in Neovim!
## TODO
There are a lot of ways to improve this idea and I don't have much experience with Vim plugin programming, so contributions are welcome!
- [ ] **Wrap this into a plugin**
It would be great to make a standalone plugin out of this that can be installed with `vim-plug` or similar
- [ ] **Simplify the `init.vim` mappings (maybe factor out the common call into a separate function)**
- [ ] **Add Copilot/GPT-3 integration**
This is probably a very long shot, but I think it would be very cool to be able to select some code, hit Ctrl-G, and say something like:
*"refactor this using stl containers"*
or
*"optimize by sorting the data first"*
The plugin would then make an appropriate query using the selected text and code context to Copilot or GPT-3 and return the result.
Here is a proof-of-concept:
https://user-images.githubusercontent.com/1991296/199078847-0278fcde-5667-4748-ba0d-7d55381d6047.mp4
https://user-images.githubusercontent.com/1991296/200067939-f98d2ac2-7519-438a-85f9-79db0841ba4f.mp4
For an explanation of how this works, see: https://twitter.com/ggerganov/status/1587168771789258756
## Discussion
If you find this idea interesting, you can join the discussion here: https://github.com/ggerganov/whisper.cpp/discussions/108


@ -1,50 +0,0 @@
#!/bin/bash
# INSTRUCTIONS
#
# This simple script is called by Neovim to capture audio from the microphone and transcribe it with Whisper.
# In order for this to work, you need to clone the whisper.cpp repo and build the 'stream' tool
#
# git clone https://github.com/ggerganov/whisper.cpp
# cd whisper.cpp
# make stream
#
# Also, make sure the current script is in your PATH env variable. You should be able to run the following command:
#
# whisper.nvim
#
# Next, export the path to the whisper.cpp repository via the WHISPER_CPP_HOME env variable:
#
# export WHISPER_CPP_HOME=/path/to/whisper.cpp
#
# Finally, add the following lines to your ~/.config/nvim/init.vim:
#
# inoremap <C-G> <C-O>:!whisper.nvim<CR><C-O>:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")<CR><C-R>a
# nnoremap <C-G> :!whisper.nvim<CR>:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")<CR>"ap
# vnoremap <C-G> c<C-O>:!whisper.nvim<CR><C-O>:let @a = system("cat /tmp/whisper.nvim \| tail -n 1 \| xargs -0 \| tr -d '\\n' \| sed -e 's/^[[:space:]]*//'")<CR><C-R>a
#
# This allows you to press Ctrl-G in order to capture audio from the microphone and transcribe it.
# When you are done speaking, press Ctrl-C
#
# the Whisper model to use
model="base.en"
# export the path to the whisper.cpp repo in the WHISPER_CPP_HOME env variable
# https://github.com/ggerganov/whisper.cpp
cd ${WHISPER_CPP_HOME}
if [ ! -f ./stream ] ; then
echo "whisper.nvim: the 'stream' executable was not found! WHISPER_CPP_HOME=${WHISPER_CPP_HOME}" > /tmp/whisper.nvim
exit 1
fi
if [ ! -f ./models/ggml-${model}.bin ] ; then
echo "whisper.nvim: the '$model' model was not found! WHISPER_CPP_HOME=${WHISPER_CPP_HOME}" > /tmp/whisper.nvim
exit 2
fi
# fine-tune the parameters according to your machine specs
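#   --step   : how often inference is run, in milliseconds (smaller = more responsive, more CPU)
#   --length : length of the audio window being transcribed, in milliseconds (see examples/stream)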
./stream -t 8 -m models/ggml-base.en.bin --step 350 --length 10000 -f /tmp/whisper.nvim 2> /dev/null
exit 0


@ -1,17 +0,0 @@
# whisper.objc
Minimal Obj-C application for automatic offline speech recognition.
The inference runs locally, on-device.
https://user-images.githubusercontent.com/1991296/197385372-962a6dea-bca1-4d50-bf96-1d8c27b98c81.mp4
## Usage
```bash
git clone https://github.com/ggerganov/whisper.cpp
open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/
```
Make sure to build the project in `Release`:
<img width="947" alt="image" src="https://user-images.githubusercontent.com/1991296/197382607-9e1e6d1b-79fa-496f-9d16-b71dc1535701.png">


@ -1,382 +0,0 @@
// !$*UTF8*$!
{
archiveVersion = 1;
classes = {
};
objectVersion = 56;
objects = {
/* Begin PBXBuildFile section */
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7D29052BDF00BD2A04 /* SceneDelegate.m */; };
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8029052BDF00BD2A04 /* ViewController.m */; };
18627C8429052BDF00BD2A04 /* Main.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8229052BDF00BD2A04 /* Main.storyboard */; };
18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; };
18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; };
18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; };
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; };
18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE"; }; };
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
/* End PBXBuildFile section */
/* Begin PBXFileReference section */
18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
18627C7C29052BDF00BD2A04 /* SceneDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = SceneDelegate.h; sourceTree = "<group>"; };
18627C7D29052BDF00BD2A04 /* SceneDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = SceneDelegate.m; sourceTree = "<group>"; };
18627C7F29052BDF00BD2A04 /* ViewController.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = ViewController.h; sourceTree = "<group>"; };
18627C8029052BDF00BD2A04 /* ViewController.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = ViewController.m; sourceTree = "<group>"; };
18627C8329052BDF00BD2A04 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/Main.storyboard; sourceTree = "<group>"; };
18627C8529052BE000BD2A04 /* Assets.xcassets */ = {isa = PBXFileReference; lastKnownFileType = folder.assetcatalog; path = Assets.xcassets; sourceTree = "<group>"; };
18627C8829052BE000BD2A04 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
18627C8A29052BE000BD2A04 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
18627C8B29052BE000BD2A04 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
18627C9229052C2B00BD2A04 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../whisper.h; sourceTree = "<group>"; };
18627C9329052C4900BD2A04 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../whisper.cpp; sourceTree = "<group>"; };
18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml.c; sourceTree = "<group>"; };
18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml.h; sourceTree = "<group>"; };
18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = "ggml-base.en.bin"; path = "../../../models/ggml-base.en.bin"; sourceTree = "<group>"; };
/* End PBXFileReference section */
/* Begin PBXFrameworksBuildPhase section */
18627C7329052BDF00BD2A04 /* Frameworks */ = {
isa = PBXFrameworksBuildPhase;
buildActionMask = 2147483647;
files = (
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXFrameworksBuildPhase section */
/* Begin PBXGroup section */
18627C6D29052BDF00BD2A04 = {
isa = PBXGroup;
children = (
18627C7829052BDF00BD2A04 /* whisper.objc */,
18627C7729052BDF00BD2A04 /* Products */,
);
sourceTree = "<group>";
};
18627C7729052BDF00BD2A04 /* Products */ = {
isa = PBXGroup;
children = (
18627C7629052BDF00BD2A04 /* whisper.objc.app */,
);
name = Products;
sourceTree = "<group>";
};
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup;
children = (
18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */,
18627C9729052C6600BD2A04 /* ggml.h */,
18627C9529052C5800BD2A04 /* ggml.c */,
18627C9329052C4900BD2A04 /* whisper.cpp */,
18627C9229052C2B00BD2A04 /* whisper.h */,
18627C7929052BDF00BD2A04 /* AppDelegate.h */,
18627C7A29052BDF00BD2A04 /* AppDelegate.m */,
18627C7C29052BDF00BD2A04 /* SceneDelegate.h */,
18627C7D29052BDF00BD2A04 /* SceneDelegate.m */,
18627C7F29052BDF00BD2A04 /* ViewController.h */,
18627C8029052BDF00BD2A04 /* ViewController.m */,
18627C8229052BDF00BD2A04 /* Main.storyboard */,
18627C8529052BE000BD2A04 /* Assets.xcassets */,
18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */,
18627C8A29052BE000BD2A04 /* Info.plist */,
18627C8B29052BE000BD2A04 /* main.m */,
);
path = whisper.objc;
sourceTree = "<group>";
};
/* End PBXGroup section */
/* Begin PBXNativeTarget section */
18627C7529052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXNativeTarget;
buildConfigurationList = 18627C8F29052BE000BD2A04 /* Build configuration list for PBXNativeTarget "whisper.objc" */;
buildPhases = (
18627C7229052BDF00BD2A04 /* Sources */,
18627C7329052BDF00BD2A04 /* Frameworks */,
18627C7429052BDF00BD2A04 /* Resources */,
);
buildRules = (
);
dependencies = (
);
name = whisper.objc;
productName = whisper.objc;
productReference = 18627C7629052BDF00BD2A04 /* whisper.objc.app */;
productType = "com.apple.product-type.application";
};
/* End PBXNativeTarget section */
/* Begin PBXProject section */
18627C6E29052BDF00BD2A04 /* Project object */ = {
isa = PBXProject;
attributes = {
BuildIndependentTargetsInParallel = 1;
LastUpgradeCheck = 1400;
TargetAttributes = {
18627C7529052BDF00BD2A04 = {
CreatedOnToolsVersion = 14.0.1;
};
};
};
buildConfigurationList = 18627C7129052BDF00BD2A04 /* Build configuration list for PBXProject "whisper.objc" */;
compatibilityVersion = "Xcode 14.0";
developmentRegion = en;
hasScannedForEncodings = 0;
knownRegions = (
en,
Base,
);
mainGroup = 18627C6D29052BDF00BD2A04;
productRefGroup = 18627C7729052BDF00BD2A04 /* Products */;
projectDirPath = "";
projectRoot = "";
targets = (
18627C7529052BDF00BD2A04 /* whisper.objc */,
);
};
/* End PBXProject section */
/* Begin PBXResourcesBuildPhase section */
18627C7429052BDF00BD2A04 /* Resources */ = {
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */,
18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */,
18627C8429052BDF00BD2A04 /* Main.storyboard in Resources */,
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXResourcesBuildPhase section */
/* Begin PBXSourcesBuildPhase section */
18627C7229052BDF00BD2A04 /* Sources */ = {
isa = PBXSourcesBuildPhase;
buildActionMask = 2147483647;
files = (
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
18627C9629052C5800BD2A04 /* ggml.c in Sources */,
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
18627C8C29052BE000BD2A04 /* main.m in Sources */,
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXSourcesBuildPhase section */
/* Begin PBXVariantGroup section */
18627C8229052BDF00BD2A04 /* Main.storyboard */ = {
isa = PBXVariantGroup;
children = (
18627C8329052BDF00BD2A04 /* Base */,
);
name = Main.storyboard;
sourceTree = "<group>";
};
18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */ = {
isa = PBXVariantGroup;
children = (
18627C8829052BE000BD2A04 /* Base */,
);
name = LaunchScreen.storyboard;
sourceTree = "<group>";
};
/* End PBXVariantGroup section */
/* Begin XCBuildConfiguration section */
18627C8D29052BE000BD2A04 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
GCC_OPTIMIZATION_LEVEL = 0;
GCC_PREPROCESSOR_DEFINITIONS = (
"DEBUG=1",
"$(inherited)",
);
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES;
ONLY_ACTIVE_ARCH = YES;
SDKROOT = iphoneos;
};
name = Debug;
};
18627C8E29052BE000BD2A04 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ALWAYS_SEARCH_USER_PATHS = NO;
CLANG_ANALYZER_NONNULL = YES;
CLANG_ANALYZER_NUMBER_OBJECT_CONVERSION = YES_AGGRESSIVE;
CLANG_CXX_LANGUAGE_STANDARD = "gnu++20";
CLANG_ENABLE_MODULES = YES;
CLANG_ENABLE_OBJC_ARC = YES;
CLANG_ENABLE_OBJC_WEAK = YES;
CLANG_WARN_BLOCK_CAPTURE_AUTORELEASING = YES;
CLANG_WARN_BOOL_CONVERSION = YES;
CLANG_WARN_COMMA = YES;
CLANG_WARN_CONSTANT_CONVERSION = YES;
CLANG_WARN_DEPRECATED_OBJC_IMPLEMENTATIONS = YES;
CLANG_WARN_DIRECT_OBJC_ISA_USAGE = YES_ERROR;
CLANG_WARN_DOCUMENTATION_COMMENTS = YES;
CLANG_WARN_EMPTY_BODY = YES;
CLANG_WARN_ENUM_CONVERSION = YES;
CLANG_WARN_INFINITE_RECURSION = YES;
CLANG_WARN_INT_CONVERSION = YES;
CLANG_WARN_NON_LITERAL_NULL_CONVERSION = YES;
CLANG_WARN_OBJC_IMPLICIT_RETAIN_SELF = YES;
CLANG_WARN_OBJC_LITERAL_CONVERSION = YES;
CLANG_WARN_OBJC_ROOT_CLASS = YES_ERROR;
CLANG_WARN_QUOTED_INCLUDE_IN_FRAMEWORK_HEADER = YES;
CLANG_WARN_RANGE_LOOP_ANALYSIS = YES;
CLANG_WARN_STRICT_PROTOTYPES = YES;
CLANG_WARN_SUSPICIOUS_MOVE = YES;
CLANG_WARN_UNGUARDED_AVAILABILITY = YES_AGGRESSIVE;
CLANG_WARN_UNREACHABLE_CODE = YES;
CLANG_WARN__DUPLICATE_METHOD_MATCH = YES;
COPY_PHASE_STRIP = NO;
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR;
GCC_WARN_UNDECLARED_SELECTOR = YES;
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES;
SDKROOT = iphoneos;
VALIDATE_PRODUCT = YES;
};
name = Release;
};
18627C9029052BE000BD2A04 /* Debug */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
INFOPLIST_KEY_UIMainStoryboardFile = Main;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Debug;
};
18627C9129052BE000BD2A04 /* Release */ = {
isa = XCBuildConfiguration;
buildSettings = {
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
CODE_SIGN_STYLE = Automatic;
CURRENT_PROJECT_VERSION = 1;
DEVELOPMENT_TEAM = P8JZH34X63;
GENERATE_INFOPLIST_FILE = YES;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
INFOPLIST_KEY_UIMainStoryboardFile = Main;
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
LD_RUNPATH_SEARCH_PATHS = (
"$(inherited)",
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
TARGETED_DEVICE_FAMILY = "1,2";
};
name = Release;
};
/* End XCBuildConfiguration section */
/* Begin XCConfigurationList section */
18627C7129052BDF00BD2A04 /* Build configuration list for PBXProject "whisper.objc" */ = {
isa = XCConfigurationList;
buildConfigurations = (
18627C8D29052BE000BD2A04 /* Debug */,
18627C8E29052BE000BD2A04 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
18627C8F29052BE000BD2A04 /* Build configuration list for PBXNativeTarget "whisper.objc" */ = {
isa = XCConfigurationList;
buildConfigurations = (
18627C9029052BE000BD2A04 /* Debug */,
18627C9129052BE000BD2A04 /* Release */,
);
defaultConfigurationIsVisible = 0;
defaultConfigurationName = Release;
};
/* End XCConfigurationList section */
};
rootObject = 18627C6E29052BDF00BD2A04 /* Project object */;
}


@ -1,7 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<Workspace
version = "1.0">
<FileRef
location = "self:">
</FileRef>
</Workspace>


@ -1,8 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>IDEDidComputeMac32BitWarning</key>
<true/>
</dict>
</plist>


@ -1,14 +0,0 @@
//
// AppDelegate.h
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//
#import <UIKit/UIKit.h>
@interface AppDelegate : UIResponder <UIApplicationDelegate>
@end


@ -1,40 +0,0 @@
//
// AppDelegate.m
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//
#import "AppDelegate.h"
@interface AppDelegate ()
@end
@implementation AppDelegate
- (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
// Override point for customization after application launch.
return YES;
}
#pragma mark - UISceneSession lifecycle
- (UISceneConfiguration *)application:(UIApplication *)application configurationForConnectingSceneSession:(UISceneSession *)connectingSceneSession options:(UISceneConnectionOptions *)options {
// Called when a new scene session is being created.
// Use this method to select a configuration to create the new scene with.
return [[UISceneConfiguration alloc] initWithName:@"Default Configuration" sessionRole:connectingSceneSession.role];
}
- (void)application:(UIApplication *)application didDiscardSceneSessions:(NSSet<UISceneSession *> *)sceneSessions {
// Called when the user discards a scene session.
// If any sessions were discarded while the application was not running, this will be called shortly after application:didFinishLaunchingWithOptions.
// Use this method to release any resources that were specific to the discarded scenes, as they will not return.
}
@end


@ -1,11 +0,0 @@
{
"colors" : [
{
"idiom" : "universal"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}


@ -1,13 +0,0 @@
{
"images" : [
{
"idiom" : "universal",
"platform" : "ios",
"size" : "1024x1024"
}
],
"info" : {
"author" : "xcode",
"version" : 1
}
}


@ -1,6 +0,0 @@
{
"info" : {
"author" : "xcode",
"version" : 1
}
}


@ -1,25 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="13122.16" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" launchScreen="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="01J-lp-oVM">
<dependencies>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="13104.12"/>
<capability name="Safe area layout guides" minToolsVersion="9.0"/>
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
</dependencies>
<scenes>
<!--View Controller-->
<scene sceneID="EHf-IW-A2E">
<objects>
<viewController id="01J-lp-oVM" sceneMemberID="viewController">
<view key="view" contentMode="scaleToFill" id="Ze5-6b-2t3">
<rect key="frame" x="0.0" y="0.0" width="375" height="667"/>
<autoresizingMask key="autoresizingMask" widthSizable="YES" heightSizable="YES"/>
<color key="backgroundColor" xcode11CocoaTouchSystemColor="systemBackgroundColor" cocoaTouchSystemColor="whiteColor"/>
<viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
</view>
</viewController>
<placeholder placeholderIdentifier="IBFirstResponder" id="iYj-Kq-Ea1" userLabel="First Responder" sceneMemberID="firstResponder"/>
</objects>
<point key="canvasLocation" x="53" y="375"/>
</scene>
</scenes>
</document>


@ -1,89 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<device id="retina6_0" orientation="portrait" appearance="light"/>
<dependencies>
<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
<capability name="Safe area layout guides" minToolsVersion="9.0"/>
<capability name="System colors in document resources" minToolsVersion="11.0"/>
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
</dependencies>
<scenes>
<!--View Controller-->
<scene sceneID="tne-QT-ifu">
<objects>
<viewController id="BYZ-38-t0r" customClass="ViewController" sceneMemberID="viewController">
<view key="view" contentMode="scaleToFill" id="8bC-Xf-vdC">
<rect key="frame" x="0.0" y="0.0" width="390" height="844"/>
<autoresizingMask key="autoresizingMask" flexibleMinX="YES" widthSizable="YES" flexibleMinY="YES" heightSizable="YES"/>
<subviews>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="VOi-PT-Rbu">
<rect key="frame" x="35" y="121" width="156" height="49"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
<color key="tintColor" systemColor="opaqueSeparatorColor"/>
<state key="normal" title="Start Capturing">
<color key="titleColor" systemColor="labelColor"/>
</state>
<connections>
<action selector="toggleCapture:" destination="BYZ-38-t0r" eventType="touchUpInside" id="BuO-Wf-RgV"/>
</connections>
</button>
<label opaque="NO" userInteractionEnabled="NO" contentMode="left" horizontalHuggingPriority="251" verticalHuggingPriority="251" fixedFrame="YES" text="Status: Idle" textAlignment="natural" lineBreakMode="tailTruncation" baselineAdjustment="alignBaselines" adjustsFontSizeToFit="NO" translatesAutoresizingMaskIntoConstraints="NO" id="Tgu-2q-eHQ">
<rect key="frame" x="35" y="78" width="232" height="21"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<fontDescription key="fontDescription" type="system" pointSize="17"/>
<nil key="textColor"/>
<nil key="highlightedColor"/>
</label>
<textView clipsSubviews="YES" multipleTouchEnabled="YES" contentMode="scaleToFill" fixedFrame="YES" text="Record some speech and press &quot;Transcribe&quot;. The result will be displayed here." textAlignment="natural" translatesAutoresizingMaskIntoConstraints="NO" id="mv2-KD-7jn">
<rect key="frame" x="35" y="248" width="320" height="300"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
<color key="textColor" systemColor="labelColor"/>
<fontDescription key="fontDescription" type="system" pointSize="20"/>
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
</textView>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
<rect key="frame" x="35" y="191" width="156" height="49"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
<color key="tintColor" systemColor="opaqueSeparatorColor"/>
<state key="normal" title="Transcribe">
<color key="titleColor" systemColor="labelColor"/>
</state>
<connections>
<action selector="onTranscribe:" destination="BYZ-38-t0r" eventType="touchUpInside" id="ond-bx-48O"/>
<action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
</connections>
</button>
</subviews>
<viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
<constraints>
<constraint firstItem="Brs-xi-o8i" firstAttribute="trailing" secondItem="VOi-PT-Rbu" secondAttribute="trailing" id="8mF-AW-cbc"/>
</constraints>
</view>
<connections>
<outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
<outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
<outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
<outlet property="textviewResult" destination="mv2-KD-7jn" id="RBw-0L-iGj"/>
</connections>
</viewController>
<placeholder placeholderIdentifier="IBFirstResponder" id="dkx-z0-nzr" sceneMemberID="firstResponder"/>
</objects>
<point key="canvasLocation" x="30.769230769230766" y="-28.436018957345969"/>
</scene>
</scenes>
<resources>
<systemColor name="labelColor">
<color red="0.0" green="0.0" blue="0.0" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
</systemColor>
<systemColor name="opaqueSeparatorColor">
<color red="0.77647058823529413" green="0.77647058823529413" blue="0.78431372549019607" alpha="1" colorSpace="custom" customColorSpace="sRGB"/>
</systemColor>
<systemColor name="systemBackgroundColor">
<color white="1" alpha="1" colorSpace="custom" customColorSpace="genericGamma22GrayColorSpace"/>
</systemColor>
</resources>
</document>


@ -1,27 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<dict>
<key>NSMicrophoneUsageDescription</key>
<string>This app requires microphone access in order to transcribe speech</string>
<key>UIApplicationSceneManifest</key>
<dict>
<key>UIApplicationSupportsMultipleScenes</key>
<false/>
<key>UISceneConfigurations</key>
<dict>
<key>UIWindowSceneSessionRoleApplication</key>
<array>
<dict>
<key>UISceneConfigurationName</key>
<string>Default Configuration</string>
<key>UISceneDelegateClassName</key>
<string>SceneDelegate</string>
<key>UISceneStoryboardFile</key>
<string>Main</string>
</dict>
</array>
</dict>
</dict>
</dict>
</plist>


@ -1,15 +0,0 @@
//
// SceneDelegate.h
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//
#import <UIKit/UIKit.h>
@interface SceneDelegate : UIResponder <UIWindowSceneDelegate>
@property (strong, nonatomic) UIWindow * window;
@end


@ -1,57 +0,0 @@
//
// SceneDelegate.m
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//
#import "SceneDelegate.h"
@interface SceneDelegate ()
@end
@implementation SceneDelegate
- (void)scene:(UIScene *)scene willConnectToSession:(UISceneSession *)session options:(UISceneConnectionOptions *)connectionOptions {
// Use this method to optionally configure and attach the UIWindow `window` to the provided UIWindowScene `scene`.
// If using a storyboard, the `window` property will automatically be initialized and attached to the scene.
// This delegate does not imply the connecting scene or session are new (see `application:configurationForConnectingSceneSession` instead).
}
- (void)sceneDidDisconnect:(UIScene *)scene {
// Called as the scene is being released by the system.
// This occurs shortly after the scene enters the background, or when its session is discarded.
// Release any resources associated with this scene that can be re-created the next time the scene connects.
// The scene may re-connect later, as its session was not necessarily discarded (see `application:didDiscardSceneSessions` instead).
}
- (void)sceneDidBecomeActive:(UIScene *)scene {
// Called when the scene has moved from an inactive state to an active state.
// Use this method to restart any tasks that were paused (or not yet started) when the scene was inactive.
}
- (void)sceneWillResignActive:(UIScene *)scene {
// Called when the scene will move from an active state to an inactive state.
// This may occur due to temporary interruptions (ex. an incoming phone call).
}
- (void)sceneWillEnterForeground:(UIScene *)scene {
// Called as the scene transitions from the background to the foreground.
// Use this method to undo the changes made on entering the background.
}
- (void)sceneDidEnterBackground:(UIScene *)scene {
// Called as the scene transitions from the foreground to the background.
// Use this method to save data, release shared resources, and store enough scene-specific state information
// to restore the scene back to its current state.
}
@end


@ -1,41 +0,0 @@
//
// ViewController.h
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//
#import <UIKit/UIKit.h>
#import <AVFoundation/AVFoundation.h>
#import <AudioToolbox/AudioQueue.h>
#define NUM_BUFFERS 3
#define MAX_AUDIO_SEC 30
#define SAMPLE_RATE 16000
struct whisper_context;
typedef struct
{
int ggwaveId;
bool isCapturing;
UILabel * labelReceived;
AudioQueueRef queue;
AudioStreamBasicDescription dataFormat;
AudioQueueBufferRef buffers[NUM_BUFFERS];
int n_samples;
int16_t * audioBufferI16;
float * audioBufferF32;
struct whisper_context * ctx;
} StateInp;
@interface ViewController : UIViewController
{
StateInp stateInp;
}
@end


@ -1,240 +0,0 @@
//
// ViewController.m
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//
#import "ViewController.h"
#import "whisper.h"
#define NUM_BYTES_PER_BUFFER 16*1024
// callback used to process captured audio
void AudioInputCallback(void * inUserData,
AudioQueueRef inAQ,
AudioQueueBufferRef inBuffer,
const AudioTimeStamp * inStartTime,
UInt32 inNumberPacketDescriptions,
const AudioStreamPacketDescription * inPacketDescs);
@interface ViewController ()
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;
@end
@implementation ViewController
- (void)setupAudioFormat:(AudioStreamBasicDescription*)format
{
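// mono 16-bit signed PCM at 16 kHz - the input format expected by whisper.cpp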
format->mSampleRate = 16000;
format->mFormatID = kAudioFormatLinearPCM;
format->mFramesPerPacket = 1;
format->mChannelsPerFrame = 1;
format->mBytesPerFrame = 2;
format->mBytesPerPacket = 2;
format->mBitsPerChannel = 16;
format->mReserved = 0;
format->mFormatFlags = kLinearPCMFormatFlagIsSignedInteger;
}
- (void)viewDidLoad {
[super viewDidLoad];
// whisper.cpp initialization
{
// load the model
NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"ggml-base.en" ofType:@"bin"];
// check if the model exists
if (![[NSFileManager defaultManager] fileExistsAtPath:modelPath]) {
NSLog(@"Model file not found");
return;
}
NSLog(@"Loading model from %@", modelPath);
// create ggml context
stateInp.ctx = whisper_init([modelPath UTF8String]);
// check if the model was loaded successfully
if (stateInp.ctx == NULL) {
NSLog(@"Failed to load model");
return;
}
}
// initialize audio format and buffers
{
[self setupAudioFormat:&stateInp.dataFormat];
stateInp.n_samples = 0;
stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
}
}
-(IBAction) stopCapturing {
NSLog(@"Stop capturing");
_labelStatusInp.text = @"Status: Idle";
[_buttonToggleCapture setTitle:@"Start capturing" forState:UIControlStateNormal];
[_buttonToggleCapture setBackgroundColor:[UIColor grayColor]];
stateInp.isCapturing = false;
AudioQueueStop(stateInp.queue, true);
for (int i = 0; i < NUM_BUFFERS; i++) {
AudioQueueFreeBuffer(stateInp.queue, stateInp.buffers[i]);
}
AudioQueueDispose(stateInp.queue, true);
}
- (IBAction)toggleCapture:(id)sender {
if (stateInp.isCapturing) {
// stop capturing
[self stopCapturing];
return;
}
// initiate audio capturing
NSLog(@"Start capturing");
stateInp.n_samples = 0;
OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
AudioInputCallback,
&stateInp,
CFRunLoopGetCurrent(),
kCFRunLoopCommonModes,
0,
&stateInp.queue);
if (status == 0) {
for (int i = 0; i < NUM_BUFFERS; i++) {
AudioQueueAllocateBuffer(stateInp.queue, NUM_BYTES_PER_BUFFER, &stateInp.buffers[i]);
AudioQueueEnqueueBuffer (stateInp.queue, stateInp.buffers[i], 0, NULL);
}
stateInp.isCapturing = true;
status = AudioQueueStart(stateInp.queue, NULL);
if (status == 0) {
_labelStatusInp.text = @"Status: Capturing";
[sender setTitle:@"Stop Capturing" forState:UIControlStateNormal];
[_buttonToggleCapture setBackgroundColor:[UIColor redColor]];
}
}
if (status != 0) {
[self stopCapturing];
}
}
- (IBAction)onTranscribePrepare:(id)sender {
_textviewResult.text = @"Processing - please wait ...";
if (stateInp.isCapturing) {
// stop capturing
[self stopCapturing];
return;
}
}
- (IBAction)onTranscribe:(id)sender {
NSLog(@"Processing %d samples", stateInp.n_samples);
// process captured audio
// convert I16 to F32
for (int i = 0; i < stateInp.n_samples; i++) {
stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
}
// run the model
struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
params.print_realtime = true;
params.print_progress = false;
params.print_timestamps = true;
params.print_special_tokens = false;
params.translate = false;
params.language = "en";
params.n_threads = 4;
params.offset_ms = 0;
CFTimeInterval startTime = CACurrentMediaTime();
if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
NSLog(@"Failed to run the model");
_textviewResult.text = @"Failed to run the model";
return;
}
CFTimeInterval endTime = CACurrentMediaTime();
// clear the text in the textview
_textviewResult.text = @"";
int n_segments = whisper_full_n_segments(stateInp.ctx);
for (int i = 0; i < n_segments; i++) {
const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
// append the text to the textview
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
}
// internal model timing
whisper_print_timings(stateInp.ctx);
NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
}
//
// Callback implementation
//
void AudioInputCallback(void * inUserData,
AudioQueueRef inAQ,
AudioQueueBufferRef inBuffer,
const AudioTimeStamp * inStartTime,
UInt32 inNumberPacketDescriptions,
const AudioStreamPacketDescription * inPacketDescs)
{
StateInp * stateInp = (StateInp*)inUserData;
if (!stateInp->isCapturing) {
NSLog(@"Not capturing, ignoring audio");
return;
}
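// mAudioDataByteSize is in bytes - two bytes per 16-bit sample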
const int n = inBuffer->mAudioDataByteSize / 2;
NSLog(@"Captured %d new samples", n);
if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
NSLog(@"Too much audio data, ignoring");
return;
}
for (int i = 0; i < n; i++) {
stateInp->audioBufferI16[stateInp->n_samples + i] = ((short*)inBuffer->mAudioData)[i];
}
stateInp->n_samples += n;
// put the buffer back in the queue
AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
}
@end


@ -1,18 +0,0 @@
//
// main.m
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//
#import <UIKit/UIKit.h>
#import "AppDelegate.h"
int main(int argc, char * argv[]) {
NSString * appDelegateClassName;
@autoreleasepool {
// Setup code that might create autoreleased objects goes here.
appDelegateClassName = NSStringFromClass([AppDelegate class]);
}
return UIApplicationMain(argc, argv, nil, appDelegateClassName);
}


@ -1,4 +0,0 @@
set(TARGET whisper.wasm)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/index-tmpl.html ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/index.html @ONLY)
configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/whisper.js ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/${TARGET}/whisper.js COPYONLY)


@ -1,43 +0,0 @@
# whisper.wasm
Inference of [OpenAI's Whisper ASR model](https://github.com/openai/whisper) inside the browser
This example uses a WebAssembly (WASM) port of the [whisper.cpp](https://github.com/ggerganov/whisper.cpp)
implementation of the transformer to run the inference inside a web page. The audio data does not leave your computer -
it is processed locally on your machine. The performance is not great, but you should be able to achieve x2 or x3
real-time for the `tiny` and `base` models on a modern CPU and browser (i.e. transcribe 60 seconds of audio in about
20-30 seconds).
This WASM port utilizes [WASM SIMD 128-bit intrinsics](https://emcc.zcopy.site/docs/porting/simd/) so you have to make
sure that [your browser supports them](https://webassembly.org/roadmap/).
The example is capable of running all models up to size `small` inclusive. Beyond that, the memory requirements and
performance are unsatisfactory. The implementation currently supports only the `Greedy` sampling strategy. Both
transcription and translation are supported.
Since the model data is quite big (74 MB for the `tiny` model), you need to manually load the model into the web page.
The example supports both loading audio from a file and recording audio from the microphone. The maximum length of the
audio is limited to 120 seconds.
## Live demo
Link: https://whisper.ggerganov.com
![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png)
## Build instructions
```bash
# build using Emscripten
git clone https://github.com/ggerganov/whisper.cpp
cd whisper.cpp
mkdir build-em && cd build-em
emcmake cmake ..
make -j
# copy the produced page to your HTTP path
cp bin/whisper.wasm/index.html /path/to/html/
cp bin/whisper.wasm/whisper.js /path/to/html/
cp bin/libwhisper.worker.js /path/to/html/


@ -1,485 +0,0 @@
<!doctype html>
<html lang="en-us">
<head>
<title>whisper.cpp : WASM example</title>
<style>
#output {
width: 100%;
height: 100%;
margin: 0 auto;
margin-top: 10px;
border-left: 0px;
border-right: 0px;
padding-left: 0px;
padding-right: 0px;
display: block;
background-color: black;
color: white;
font-size: 10px;
font-family: 'Lucida Console', Monaco, monospace;
outline: none;
white-space: pre;
overflow-wrap: normal;
overflow-x: scroll;
}
</style>
</head>
<body>
<div id="main-container">
<b>Minimal <a href="https://github.com/ggerganov/whisper.cpp">whisper.cpp</a> example running fully in the browser</b>
<br><br>
Usage instructions:<br>
<ul>
<li>Load a ggml model file (you can obtain one from <a href="https://ggml.ggerganov.com/">here</a>, recommended: <b>tiny</b> or <b>base</b>)</li>
<li>Select audio file to transcribe or record audio from the microphone (sample: <a href="https://whisper.ggerganov.com/jfk.wav">jfk.wav</a>)</li>
<li>Click on the "Transcribe" button to start the transcription</li>
</ul>
Note that the computation is quite heavy and may take a few seconds to complete.<br>
The transcription results will be displayed in the text area below.<br><br>
<b>Important: your browser must support WASM SIMD instructions for this to work.</b>
<br><br><hr>
<div id="model">
Model:
<input type="file" id="file" name="file" onchange="loadFile(event, 'ggml.bin')" />
</div>
<br>
<!-- radio button to select between file upload or microphone -->
<div id="input">
Input:
<input type="radio" id="file" name="input" value="file" checked="checked" onchange="changeInput('file')" /> File
<input type="radio" id="mic" name="input" value="mic" onchange="changeInput('mic')" /> Microphone
</div>
<br>
<div id="input_file">
Audio file:
<input type="file" id="file" name="file" onchange="loadAudio(event)" />
</div>
<div id="input_mic" style="display: none;">
Microphone:
<button id="start" onclick="startRecording()">Start</button>
<button id="stop" onclick="stopRecording()" disabled>Stop</button>
<!-- progress bar to show recording progress -->
<br><br>
<div id="progress" style="display: none;">
<div id="progress-bar" style="width: 0%; height: 10px; background-color: #4CAF50;"></div>
<div id="progress-text">0%</div>
</div>
</div>
<audio controls="controls" id="audio" loop hidden>
Your browser does not support the &lt;audio&gt; tag.
<source id="source" src="" type="audio/wav" />
</audio>
<hr><br>
<table>
<tr>
<td>
Language:
<select id="language" name="language">
<option value="en">English</option>
<option value="ar">Arabic</option>
<option value="hy">Armenian</option>
<option value="az">Azerbaijani</option>
<option value="eu">Basque</option>
<option value="be">Belarusian</option>
<option value="bn">Bengali</option>
<option value="bg">Bulgarian</option>
<option value="ca">Catalan</option>
<option value="zh">Chinese</option>
<option value="hr">Croatian</option>
<option value="cs">Czech</option>
<option value="da">Danish</option>
<option value="nl">Dutch</option>
<option value="en">English</option>
<option value="et">Estonian</option>
<option value="tl">Filipino</option>
<option value="fi">Finnish</option>
<option value="fr">French</option>
<option value="gl">Galician</option>
<option value="ka">Georgian</option>
<option value="de">German</option>
<option value="el">Greek</option>
<option value="gu">Gujarati</option>
<option value="iw">Hebrew</option>
<option value="hi">Hindi</option>
<option value="hu">Hungarian</option>
<option value="is">Icelandic</option>
<option value="id">Indonesian</option>
<option value="ga">Irish</option>
<option value="it">Italian</option>
<option value="ja">Japanese</option>
<option value="kn">Kannada</option>
<option value="ko">Korean</option>
<option value="la">Latin</option>
<option value="lv">Latvian</option>
<option value="lt">Lithuanian</option>
<option value="mk">Macedonian</option>
<option value="ms">Malay</option>
<option value="mt">Maltese</option>
<option value="no">Norwegian</option>
<option value="fa">Persian</option>
<option value="pl">Polish</option>
<option value="pt">Portuguese</option>
<option value="ro">Romanian</option>
<option value="ru">Russian</option>
<option value="sr">Serbian</option>
<option value="sk">Slovak</option>
<option value="sl">Slovenian</option>
<option value="es">Spanish</option>
<option value="sw">Swahili</option>
<option value="sv">Swedish</option>
<option value="ta">Tamil</option>
<option value="te">Telugu</option>
<option value="th">Thai</option>
<option value="tr">Turkish</option>
<option value="uk">Ukrainian</option>
<option value="ur">Urdu</option>
<option value="vi">Vietnamese</option>
<option value="cy">Welsh</option>
<option value="yi">Yiddish</option>
</select>
</td>
<td>
<button onclick="onProcess(false);">Transcribe</button>
</td>
<td>
<button onclick="onProcess(true);">Translate</button>
</td>
</tr>
</table>
<br>
<!-- textarea with height filling the rest of the page -->
<textarea id="output" rows="20"></textarea>
<br><br>
<div class="cell-version">
<span>
|
Build time: <span class="nav-link">@GIT_DATE@</span> |
Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
<a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/whisper.wasm">Source Code</a> |
</span>
</div>
</div>
<script type='text/javascript'>
// TODO: convert audio buffer to WAV
function setAudio(audio) {
//if (audio) {
// // convert to 16-bit PCM
// var blob = new Blob([audio], { type: 'audio/wav' });
// var url = URL.createObjectURL(blob);
// document.getElementById('source').src = url;
// document.getElementById('audio').hidden = false;
// document.getElementById('audio').loop = false;
// document.getElementById('audio').load();
//} else {
// document.getElementById('audio').hidden = true;
//}
}
function changeInput(input) {
if (input == 'file') {
document.getElementById('input_file').style.display = 'block';
document.getElementById('input_mic').style.display = 'none';
document.getElementById('progress').style.display = 'none';
} else {
document.getElementById('input_file').style.display = 'none';
document.getElementById('input_mic').style.display = 'block';
document.getElementById('progress').style.display = 'block';
}
}
var printTextarea = (function() {
var element = document.getElementById('output');
if (element) element.value = ''; // clear browser cache
return function(text) {
if (arguments.length > 1) text = Array.prototype.slice.call(arguments).join(' ');
console.log(text);
if (element) {
element.value += text + "\n";
element.scrollTop = element.scrollHeight; // focus on bottom
}
};
})();
var Module = {
print: printTextarea,
printErr: printTextarea,
setStatus: function(text) {
printTextarea('js: ' + text);
},
monitorRunDependencies: function(left) {
}
};
const kMaxAudio_s = 120;
const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext;
window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
// web audio context
var context = null;
// audio data
var audio = null;
// the whisper instance
var instance = null;
var model_fname = '';
// helper function
function convertTypedArray(src, type) {
var buffer = new ArrayBuffer(src.byteLength);
new src.constructor(buffer).set(src); // copy src into the fresh buffer
return new type(buffer);
}
//
// load model
//
function loadFile(event, fname) {
var file = event.target.files[0] || null;
if (file == null) {
return;
}
printTextarea("js: loading model: " + file.name + ", size: " + file.size + " bytes");
printTextarea('js: please wait ...');
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
// write to WASM file using whisper.FS_createDataFile
// if the file exists, delete it
try {
Module.FS_unlink(fname);
} catch (e) {
}
Module.FS_createDataFile("/", fname, buf, true, true);
model_fname = file.name;
printTextarea('js: loaded model: ' + model_fname + ' size: ' + buf.length);
}
reader.readAsArrayBuffer(file);
}
//
// audio file
//
function loadAudio(event) {
if (!context) {
context = new AudioContext({sampleRate: 16000});
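// (decode at 16 kHz - the sample rate whisper.cpp expects)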
}
var file = event.target.files[0] || null;
if (file == null) {
return;
}
printTextarea('js: loading audio: ' + file.name + ', size: ' + file.size + ' bytes');
printTextarea('js: please wait ...');
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
printTextarea('js: audio loaded, size: ' + audio.length);
// truncate to the first kMaxAudio_s seconds
if (audio.length > kMaxAudio_s*kSampleRate) {
audio = audio.slice(0, kMaxAudio_s*kSampleRate);
printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds');
}
setAudio(audio);
});
}, function(e) {
printTextarea('js: error decoding audio: ' + e);
audio = null;
setAudio(audio);
});
}
reader.readAsArrayBuffer(file);
}
//
// microphone
//
var mediaRecorder = null;
var doRecording = false;
var startTime = 0;
function stopRecording() {
doRecording = false;
}
// record up to kMaxAudio_s seconds of audio from the microphone
// check if doRecording is false every 1000 ms and stop recording if so
// update progress information
function startRecording() {
if (!context) {
context = new AudioContext({sampleRate: 16000});
}
document.getElementById('start').disabled = true;
document.getElementById('stop').disabled = false;
document.getElementById('progress-bar').style.width = '0%';
document.getElementById('progress-text').innerHTML = '0%';
doRecording = true;
startTime = Date.now();
var chunks = [];
var stream = null;
navigator.mediaDevices.getUserMedia({audio: true, video: false})
.then(function(s) {
stream = s;
mediaRecorder = new MediaRecorder(stream);
mediaRecorder.ondataavailable = function(e) {
chunks.push(e.data);
};
mediaRecorder.onstop = function(e) {
var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
chunks = [];
document.getElementById('start').disabled = false;
document.getElementById('stop').disabled = true;
var reader = new FileReader();
reader.onload = function(event) {
var buf = new Uint8Array(reader.result);
context.decodeAudioData(buf.buffer, function(audioBuffer) {
var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
var source = offlineContext.createBufferSource();
source.buffer = audioBuffer;
source.connect(offlineContext.destination);
source.start(0);
offlineContext.startRendering().then(function(renderedBuffer) {
audio = renderedBuffer.getChannelData(0);
printTextarea('js: audio recorded, size: ' + audio.length);
// truncate to the first kMaxAudio_s seconds
if (audio.length > kMaxAudio_s*kSampleRate) {
audio = audio.slice(0, kMaxAudio_s*kSampleRate);
printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds');
}
setAudio(audio);
});
}, function(e) {
printTextarea('js: error decoding audio: ' + e);
audio = null;
setAudio(audio);
});
}
reader.readAsArrayBuffer(blob);
};
mediaRecorder.start();
})
.catch(function(err) {
printTextarea('js: error getting audio stream: ' + err);
});
var interval = setInterval(function() {
if (!doRecording) {
clearInterval(interval);
mediaRecorder.stop();
stream.getTracks().forEach(function(track) {
track.stop();
});
}
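// show elapsed recording time as a percentage of the kMaxAudio_s limit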
document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%';
document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%';
}, 1000);
printTextarea('js: recording ...');
setTimeout(function() {
if (doRecording) {
printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds');
stopRecording();
}
}, kMaxAudio_s*1000);
}
//
// transcribe
//
function onProcess(translate) {
if (!instance) {
instance = Module.init('ggml.bin');
if (instance) {
printTextarea("js: whisper initialized, instance: " + instance);
document.getElementById('model').innerHTML = 'Model loaded: ' + model_fname;
}
}
if (!instance) {
printTextarea("js: failed to initialize whisper");
return;
}
if (!audio) {
printTextarea("js: no audio data");
return;
}
if (instance) {
printTextarea('');
printTextarea('js: processing - this might take a while ...');
printTextarea('js: the page will be unresponsive until the processing is completed');
printTextarea('');
setTimeout(function() {
var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate);
console.log('js: full_default returned: ' + ret);
if (ret) {
printTextarea("js: whisper returned: " + ret);
}
}, 100);
}
}
</script>
<script type="text/javascript" src="whisper.js"></script>
</body>
</html>


@ -1,53 +0,0 @@
#!/bin/bash
# Helper script to run the bench tool on all models and print the results in share-able format
printf "Usage: ./bench.sh [n_threads]\n"
if [ -z "$1" ]; then
n_threads=4
else
n_threads=$1
fi
models=( "tiny" "base" "small" "medium" "large" )
printf "\n"
printf "Running benchmark for all models\n"
printf "This can take a while!\n"
printf "\n"
printf "| CPU | OS | Config | Model | Threads | Load [ms] | Encode [ms] |\n"
printf "| --- | -- | ------ | ----- | ------- | --------- | ----------- |\n"
for model in "${models[@]}"; do
# run once to heat-up the cache
./bench -m ./models/ggml-$model.bin -t $n_threads 2>/dev/null 1>/dev/null
# actual run
# store stderr output in a variable in order to parse it later
output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
# parse the output:
load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
encode_time=$(echo "$output" | grep "encode time" | awk '{print $5}')
system_info=$(echo "$output" | grep "system_info")
n_threads=$(echo "$output" | grep "system_info" | awk '{print $4}')
config=""
if [[ $system_info == *"AVX2 = 1"* ]]; then
config="$config AVX2"
fi
if [[ $system_info == *"NEON = 1"* ]]; then
config="$config NEON"
fi
if [[ $system_info == *"BLAS = 1"* ]]; then
config="$config BLAS"
fi
printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time |\n"
done


@ -1,8 +0,0 @@
#!/bin/bash
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )
for model in "${models[@]}"; do
python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
mv -v models/ggml-model.bin models/ggml-$model.bin
done


@ -1,7 +0,0 @@
#!/bin/bash
# Compute the SHA1 of all model files in ./models/ggml-*.bin
for f in ./models/ggml-*.bin; do
shasum "$f" -a 1
done


@ -1,38 +0,0 @@
#pragma once
#include <stdint.h>
#include <stddef.h>
// TODO: this will hold dynamic context data in the future
// currently unused
struct ggml_mtl_context {
void * dummy;
};
struct ggml_mtl_object {
int32_t id;
void * data;
};
struct ggml_mtl_context * ggml_mtl_init(void);
struct ggml_mtl_object ggml_mtl_alloc(size_t size);
// multiply matrix by vector
void ggml_mtl_mul_mat_vec_f16(
struct ggml_mtl_context * ctx,
struct ggml_mtl_object src0, // matrix f16
const __fp16 * src1, // vector f16
float * dst, // vector f32
int nrows,
int ncols);
// multiply matrix by matrix
void ggml_mtl_mul_mat_f16(
struct ggml_mtl_context * ctx,
struct ggml_mtl_object src0, // matrix f16
const __fp16 * src1, // matrix f16
float * dst, // matrix f32
int nrows0,
int nrows1,
int ncols);
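
For illustration, a minimal sketch of how these entry points might be driven (a sketch under assumptions: clang's `__fp16` on Apple silicon, and ignoring the `didModifyRange:` synchronization that managed buffers need on discrete GPUs):

```c
#include "ggml-mtl.h"

#include <stdio.h>

int main(void) {
    const int nrows = 2;
    const int ncols = 4;

    ggml_mtl_init();

    // allocate a Metal-backed f16 matrix and fill it through the CPU-visible pointer
    struct ggml_mtl_object mat = ggml_mtl_alloc(nrows*ncols*sizeof(__fp16));
    __fp16 * m = (__fp16 *) mat.data;
    for (int i = 0; i < nrows*ncols; ++i) {
        m[i] = (__fp16)(i + 1); // row 0: 1 2 3 4, row 1: 5 6 7 8
    }

    const __fp16 vec[4] = { 1, 1, 1, 1 };
    float dst[2];

    // dst = mat * vec, computed on the GPU via MPS
    ggml_mtl_mul_mat_vec_f16(NULL, mat, vec, dst, nrows, ncols);

    printf("%f %f\n", dst[0], dst[1]); // expected: 10.000000 26.000000
    return 0;
}
```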


@ -1,162 +0,0 @@
#import "ggml-mtl.h"
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>
#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
#define GGML_MTL_MAX_BUFFERS 256
// global static storage for Metal buffers
// TODO: move this into a dynamic context
static id<MTLBuffer> g_buffers[GGML_MTL_MAX_BUFFERS];
// global MTL context
// TODO: move this into a dynamic context
static id<MTLDevice> g_device;
static id<MTLCommandQueue> g_command_queue;
struct ggml_mtl_context * ggml_mtl_init() {
// TODO: implement properly
// for now, init the global MTL context and MTL buffers
g_device = MTLCreateSystemDefaultDevice();
g_command_queue = [g_device newCommandQueue];
if (g_command_queue == nil)
{
NSLog(@"Failed to find the command queue.");
return nil;
}
return nil;
}
// search for unallocated buffer slot and use it
struct ggml_mtl_object ggml_mtl_alloc(size_t size) {
// TODO: temporarily making sure that the buffers are nil at the start
static bool first = true;
if (first) {
for (int i = 0; i < GGML_MTL_MAX_BUFFERS; ++i) {
assert(g_buffers[i] == nil);
}
first = false;
}
struct ggml_mtl_object obj = { -1, nil };
for (int i = 0; i < GGML_MTL_MAX_BUFFERS; i++) {
if (g_buffers[i] == nil) {
g_buffers[i] = [g_device newBufferWithLength:size options:MTLResourceStorageModeManaged];
// link the MTL buffer to the ggml object
obj.id = i;
obj.data = [g_buffers[i] contents];
break;
}
}
return obj;
}
struct params_mul_mat_vec {
int N; // rows
int M; // cols
};
// multiply matrix with a vector using MPSMatrixVectorMultiplication
void ggml_mtl_mul_mat_vec_f16(
struct ggml_mtl_context * ctx,
struct ggml_mtl_object src0,
const __fp16 * src1,
float * dst,
int nrows,
int ncols) {
(void) ctx; // unused
// Create a command buffer to hold commands.
id<MTLCommandBuffer> commandBuffer = [g_command_queue commandBuffer];
assert(commandBuffer != nil);
// make managed device buffer to store src1
id<MTLBuffer> src1_buffer = [g_device newBufferWithBytes:src1 length:ncols*sizeof(__fp16) options:MTLResourceStorageModeManaged];
id<MTLBuffer> dst_buffer = [g_device newBufferWithLength:nrows*sizeof(float) options:MTLResourceStorageModeManaged];
// MPSMatrixDescriptor
MPSMatrixDescriptor *src0_desc = [MPSMatrixDescriptor matrixDescriptorWithRows:nrows columns:ncols rowBytes:ncols*sizeof(__fp16) dataType:MPSDataTypeFloat16];
MPSVectorDescriptor *src1_desc = [MPSVectorDescriptor vectorDescriptorWithLength:ncols dataType:MPSDataTypeFloat16];
MPSVectorDescriptor *dst_desc = [MPSVectorDescriptor vectorDescriptorWithLength:nrows dataType:MPSDataTypeFloat32];
// MPSMatrix
MPSMatrix *src0_mat = [[MPSMatrix alloc] initWithBuffer:g_buffers[src0.id] descriptor:src0_desc];
MPSVector *src1_vec = [[MPSVector alloc] initWithBuffer:src1_buffer descriptor:src1_desc];
MPSVector *dst_vec = [[MPSVector alloc] initWithBuffer:dst_buffer descriptor:dst_desc];
// MPSMatrixVectorMultiplication
MPSMatrixVectorMultiplication *mul_mat_vec = [[MPSMatrixVectorMultiplication alloc] initWithDevice:g_device transpose:NO rows:nrows columns:ncols alpha:1.0 beta:0.0];
// encode
[mul_mat_vec encodeToCommandBuffer:commandBuffer
inputMatrix:src0_mat
inputVector:src1_vec
resultVector:dst_vec];
[commandBuffer commit];
[commandBuffer waitUntilCompleted];
// copy GPU result to CPU
memcpy(dst, [dst_buffer contents], nrows*sizeof(float));
}
// multiply matrix with a matrix using MPSMatrixMultiplication
void ggml_mtl_mul_mat_f16(
struct ggml_mtl_context * ctx,
struct ggml_mtl_object src0,
const __fp16 * src1,
float * dst,
int nrows0,
int nrows1,
int ncols) {
(void) ctx; // unused
// Create a command buffer to hold commands.
id<MTLCommandBuffer> commandBuffer = [g_command_queue commandBuffer];
assert(commandBuffer != nil);
// make managed device buffer to store src1
id<MTLBuffer> src1_buffer = [g_device newBufferWithBytes:src1 length:ncols*nrows1*sizeof(__fp16) options:MTLResourceStorageModeManaged];
id<MTLBuffer> dst_buffer = [g_device newBufferWithLength:nrows0*nrows1*sizeof(float) options:MTLResourceStorageModeManaged];
// MPSMatrixDescriptor
MPSMatrixDescriptor *src0_desc = [MPSMatrixDescriptor matrixDescriptorWithRows:nrows0 columns:ncols rowBytes:ncols*sizeof(__fp16) dataType:MPSDataTypeFloat16];
MPSMatrixDescriptor *src1_desc = [MPSMatrixDescriptor matrixDescriptorWithRows:nrows1 columns:ncols rowBytes:ncols*sizeof(__fp16) dataType:MPSDataTypeFloat16];
MPSMatrixDescriptor *dst_desc = [MPSMatrixDescriptor matrixDescriptorWithRows:nrows1 columns:nrows0 rowBytes:nrows0*sizeof(float) dataType:MPSDataTypeFloat32];
// MPSMatrix
MPSMatrix *src0_mat = [[MPSMatrix alloc] initWithBuffer:g_buffers[src0.id] descriptor:src0_desc];
MPSMatrix *src1_mat = [[MPSMatrix alloc] initWithBuffer:src1_buffer descriptor:src1_desc];
MPSMatrix *dst_mat = [[MPSMatrix alloc] initWithBuffer:dst_buffer descriptor:dst_desc];
//// MPSMatrixMultiplication z = x * yT
//MPSMatrixMultiplication *mul_mat = [[MPSMatrixMultiplication alloc] initWithDevice:g_device transposeLeft:NO transposeRight:YES resultRows:nrows resultColumns:nrows interiorColumns:ncols alpha:1.0 beta:0.0];
//// encode
//[mul_mat encodeToCommandBuffer:commandBuffer
// leftMatrix:src0_mat
// rightMatrix:src1_mat
// resultMatrix:dst_mat];
// MPSMatrixMultiplication zT = xT * y
MPSMatrixMultiplication *mul_mat = [[MPSMatrixMultiplication alloc] initWithDevice:g_device transposeLeft:NO transposeRight:YES resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols alpha:1.0 beta:0.0];
// encode
[mul_mat encodeToCommandBuffer:commandBuffer
leftMatrix:src1_mat
rightMatrix:src0_mat
resultMatrix:dst_mat];
[commandBuffer commit];
[commandBuffer waitUntilCompleted];
// copy GPU result to CPU
memcpy(dst, [dst_buffer contents], nrows0*nrows1*sizeof(float));
}
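
The transpose trick in the active code path above can be written out (a sketch of the reasoning, using the names from the code): with row-major storage, computing the transposed product directly already yields the desired memory layout,

$$ \mathrm{dst} \;=\; \mathrm{src1}\,\mathrm{src0}^{\top} \;=\; \left(\mathrm{src0}\,\mathrm{src1}^{\top}\right)^{\top}, $$

so encoding `src1` as the left matrix and `src0` as the transposed right matrix produces an `nrows1 x nrows0` result whose row-major bytes the final `memcpy` can copy back unchanged, with no explicit transpose pass.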

780
ggml.c

File diff suppressed because it is too large

23
ggml.h

@ -11,7 +11,7 @@ extern "C" {
#define GGML_MAX_DIMS 4 #define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096 #define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16 #define GGML_MAX_PARAMS 16
#define GGML_MAX_CONTEXTS 64 #define GGML_MAX_CONTEXTS 16
#define GGML_MAX_OPT 4 #define GGML_MAX_OPT 4
#ifdef __ARM_NEON #ifdef __ARM_NEON
@ -108,8 +108,7 @@ struct ggml_tensor {
int64_t perf_time_us; int64_t perf_time_us;
void * data; void * data;
int32_t id; // TODO: mtl buffer id char padding[8];
char pad[4];
}; };
// computation graph // computation graph
@ -137,7 +136,6 @@ struct ggml_init_params {
void * mem_buffer; // if NULL, memory will be allocated internally void * mem_buffer; // if NULL, memory will be allocated internally
}; };
void ggml_time_init(void);
int64_t ggml_time_ms(void); int64_t ggml_time_ms(void);
int64_t ggml_time_us(void); int64_t ggml_time_us(void);
int64_t ggml_cycles(void); int64_t ggml_cycles(void);
@ -174,12 +172,6 @@ struct ggml_tensor * ggml_new_tensor_2d(
int ne0, int ne0,
int ne1); int ne1);
struct ggml_tensor * ggml_new_tensor_2d_mtl(
struct ggml_context * ctx,
enum ggml_type type,
int ne0,
int ne1);
struct ggml_tensor * ggml_new_tensor_3d( struct ggml_tensor * ggml_new_tensor_3d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
@ -555,17 +547,6 @@ enum ggml_opt_result ggml_opt(
struct ggml_opt_params params, struct ggml_opt_params params,
struct ggml_tensor * f); struct ggml_tensor * f);
//
// system info
//
int ggml_cpu_has_avx2(void);
int ggml_cpu_has_avx512(void);
int ggml_cpu_has_neon(void);
int ggml_cpu_has_fp16_va(void);
int ggml_cpu_has_wasm_simd(void);
int ggml_cpu_has_blas(void);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

243
main.cpp Normal file

@ -0,0 +1,243 @@
#include "whisper.h"
// third-party utilities
// use your favorite implementations
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
#include <algorithm>
#include <cstdio>
#include <ctime>
#include <string>
#include <thread>
#include <vector>
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t) {
int64_t sec = t/100;
int64_t msec = (t - sec*100)*10; // t is in units of 10 ms, so scale the remainder to milliseconds
int64_t min = sec/60;
sec = sec - min*60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
return std::string(buf);
}
// command-line parameters
struct whisper_params {
int32_t seed = -1; // RNG seed, not used currently
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t offset_ms = 0;
bool verbose = false;
bool translate = false;
bool print_special_tokens = false;
bool no_timestamps = false;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::vector<std::string> fname_inp = {};
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg[0] != '-') {
params.fname_inp.push_back(arg);
continue;
}
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-o" || arg == "--offset") {
params.offset_ms = std::stoi(argv[++i]);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--translate") {
params.translate = true;
} else if (arg == "-l" || arg == "--language") {
params.language = argv[++i];
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
} else if (arg == "-ps" || arg == "--print_special") {
params.print_special_tokens = true;
} else if (arg == "-nt" || arg == "--no_timestamps") {
params.no_timestamps = true;
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-f" || arg == "--file") {
params.fname_inp.push_back(argv[++i]);
} else if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int argc, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -o N, --offset N offset in milliseconds (default: %d)\n", params.offset_ms);
fprintf(stderr, " -v, --verbose verbose output\n");
fprintf(stderr, " --translate translate from source language to english\n");
fprintf(stderr, " -ps, --print_special print special tokens\n");
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME input WAV file path\n");
fprintf(stderr, "\n");
}
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (params.seed < 0) {
params.seed = time(NULL);
}
if (params.fname_inp.empty()) {
fprintf(stderr, "error: no input files specified\n");
whisper_print_usage(argc, argv, params);
return 1;
}
// whisper init
struct whisper_context * ctx = whisper_init(params.model.c_str());
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
// WAV input
std::vector<float> pcmf32;
{
drwav wav;
if (!drwav_init_file(&wav, fname_inp.c_str(), NULL)) {
fprintf(stderr, "%s: failed to open WAV file '%s' - check your input\n", argv[0], fname_inp.c_str());
whisper_print_usage(argc, argv, {});
return 2;
}
if (wav.channels != 1 && wav.channels != 2) {
fprintf(stderr, "%s: WAV file '%s' must be mono or stereo\n", argv[0], fname_inp.c_str());
return 3;
}
if (wav.sampleRate != WHISPER_SAMPLE_RATE) {
fprintf(stderr, "%s: WAV file '%s' must be 16 kHz\n", argv[0], fname_inp.c_str());
return 4;
}
if (wav.bitsPerSample != 16) {
fprintf(stderr, "%s: WAV file '%s' must be 16-bit\n", argv[0], fname_inp.c_str());
return 5;
}
int n = wav.totalPCMFrameCount;
std::vector<int16_t> pcm16;
pcm16.resize(n*wav.channels);
drwav_read_pcm_frames_s16(&wav, n, pcm16.data());
drwav_uninit(&wav);
// convert to mono, float
pcmf32.resize(n);
if (wav.channels == 1) {
for (int i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[i])/32768.0f;
}
} else {
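// stereo: average the two channels and normalize - ((L + R)/2)/32768 == (L + R)/65536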
for (int i = 0; i < n; i++) {
pcmf32[i] = float(pcm16[2*i] + pcm16[2*i + 1])/65536.0f;
}
}
}
// print some info about the processing
{
printf("\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
printf("%s: processing '%s' (%d samples, %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
printf("\n");
}
// run the inference
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_DECODE_GREEDY);
wparams.print_realtime = true;
wparams.print_progress = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.print_special_tokens = params.print_special_tokens;
wparams.translate = params.translate;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.offset_ms = params.offset_ms;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 6;
}
// print result;
if (!wparams.print_realtime) {
printf("\n");
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
if (params.no_timestamps) {
printf ("%s", text);
fflush(stdout);
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
}
}
}
}
}
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}


@ -4,7 +4,7 @@ The [original Whisper PyTorch models provided by OpenAI](https://github.com/open
have been converted to custom `ggml` format in order to be able to load them in C/C++. The conversion has been performed using the have been converted to custom `ggml` format in order to be able to load them in C/C++. The conversion has been performed using the
[convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either obtain the original models and generate the `ggml` files [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either obtain the original models and generate the `ggml` files
yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh) script to download the yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh) script to download the
already converted models from https://ggml.ggerganov.com already converted models.
Sample usage: Sample usage:
@ -22,20 +22,6 @@ A third option to obtain the model files is to download them from Hugging Face:
https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
## Available models
| Model | Disk | Mem | SHA |
| --- | --- | --- | --- |
| tiny | 75 MB | ~390 MB | `bd577a113a864445d4c299885e0cb97d4ba92b5f` |
| tiny.en | 75 MB | ~390 MB | `c78c86eb1a8faa21b369bcd33207cc90d64ae9df` |
| base | 142 MB | ~500 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
| base.en | 142 MB | ~500 MB | `137c40403d78fd54d454da0f9bd998f78703390c` |
| small | 466 MB | ~1.0 GB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
| small.en | 466 MB | ~1.0 GB | `db8a495a91d927739e50b3fc1cc4c6b8f6c2d022` |
| medium | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
| large | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
## Model files for testing purposes ## Model files for testing purposes
The model files prefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for testing purposes. The model files prefixed with `for-tests-` are empty (i.e. do not contain any weights) and are used by the CI for testing purposes.


@ -1,63 +0,0 @@
@echo off
pushd %~dp0
set models_path=%CD%
popd
set argc=0
for %%x in (%*) do set /A argc+=1
set models=tiny.en tiny base.en base small.en small medium.en medium large
if %argc% neq 1 (
echo.
echo Usage: download-ggml-model.cmd model
CALL :list_models
goto :eof
)
set model=%1
for %%b in (%models%) do (
if "%%b"=="%model%" (
CALL :download_model
goto :eof
)
)
echo Invalid model: %model%
CALL :list_models
goto :eof
:download_model
echo Downloading ggml model %model%...
cd %models_path%
if exist "ggml-%model%.bin" (
echo Model %model% already exists. Skipping download.
goto :eof
)
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://ggml.ggerganov.com/ggml-model-whisper-%model%.bin -OutFile ggml-%model%.bin"
if %ERRORLEVEL% neq 0 (
echo Failed to download ggml model %model%
echo Please try again later or download the original Whisper model files and convert them yourself.
goto :eof
)
echo Done! Model %model% saved in %models_path%\ggml-%model%.bin
echo You can now use it like this:
echo main.exe -m %models_path%\ggml-%model%.bin -f %models_path%\..\samples\jfk.wav
goto :eof
:list_models
echo.
echo Available models:
(for %%a in (%models%) do (
echo %%a
))
echo.
exit /b


@ -1,6 +0,0 @@
# Audio samples
This folder contains various audio files used for testing.
If you want to quickly get some more samples, simply run `make samples`. This will download several public audio files and convert them to the appropriate 16-bit WAV format using `ffmpeg`.
https://github.com/ggerganov/whisper.cpp/blob/a09ce6e8899198015729ffc49ae10f67370906b1/Makefile#L104-L123


@ -17,7 +17,6 @@
#include <string> #include <string>
#include <thread> #include <thread>
#include <vector> #include <vector>
#include <fstream>
// 500 -> 00:05.000 // 500 -> 00:05.000
// 6000 -> 01:00.000 // 6000 -> 01:00.000
@ -38,8 +37,6 @@ struct whisper_params {
int32_t seed = -1; // RNG seed, not used currently int32_t seed = -1; // RNG seed, not used currently
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency()); int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t step_ms = 3000; int32_t step_ms = 3000;
int32_t length_ms = 10000;
int32_t capture_id = -1;
bool verbose = false; bool verbose = false;
bool translate = false; bool translate = false;
@ -49,7 +46,7 @@ struct whisper_params {
std::string language = "en"; std::string language = "en";
std::string model = "models/ggml-base.en.bin"; std::string model = "models/ggml-base.en.bin";
std::string fname_out = ""; std::string fname_inp = "samples/jfk.wav";
}; };
void whisper_print_usage(int argc, char ** argv, const whisper_params & params); void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -64,10 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
params.n_threads = std::stoi(argv[++i]); params.n_threads = std::stoi(argv[++i]);
} else if (arg == "--step") { } else if (arg == "--step") {
params.step_ms = std::stoi(argv[++i]); params.step_ms = std::stoi(argv[++i]);
} else if (arg == "--length") {
params.length_ms = std::stoi(argv[++i]);
} else if (arg == "-c" || arg == "--capture") {
params.capture_id = std::stoi(argv[++i]);
} else if (arg == "-v" || arg == "--verbose") { } else if (arg == "-v" || arg == "--verbose") {
params.verbose = true; params.verbose = true;
} else if (arg == "--translate") { } else if (arg == "--translate") {
@ -88,7 +81,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
} else if (arg == "-m" || arg == "--model") { } else if (arg == "-m" || arg == "--model") {
params.model = argv[++i]; params.model = argv[++i];
} else if (arg == "-f" || arg == "--file") { } else if (arg == "-f" || arg == "--file") {
params.fname_out = argv[++i]; params.fname_inp = argv[++i];
} else if (arg == "-h" || arg == "--help") { } else if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
exit(0); exit(0);
@ -111,16 +104,14 @@ void whisper_print_usage(int argc, char ** argv, const whisper_params & params)
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n"); fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms); fprintf(stderr, " --step N audio step size in milliseconds (default: %d)\n", params.step_ms);
fprintf(stderr, " --length N audio length in milliseconds (default: %d)\n", params.length_ms);
fprintf(stderr, " -c ID, --capture ID capture device ID (default: -1)\n");
fprintf(stderr, " -v, --verbose verbose output\n"); fprintf(stderr, " -v, --verbose verbose output\n");
fprintf(stderr, " --translate translate from source language to english\n"); fprintf(stderr, " --translate translate from source language to english\n");
fprintf(stderr, " -kc, --keep-context keep text context from earlier audio (default: false)\n"); fprintf(stderr, " -nc, --no-context disable context from earlier audio (default: false)\n");
fprintf(stderr, " -ps, --print_special print special tokens\n"); fprintf(stderr, " -ps, --print_special print special tokens\n");
fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n"); fprintf(stderr, " -nt, --no_timestamps do not print timestamps\n");
fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str()); fprintf(stderr, " -l LANG, --language LANG spoken language (default: %s)\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME model path (default: %s)\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME text output file name (default: no output to file)\n"); fprintf(stderr, " -f FNAME, --file FNAME input WAV file path (default: %s)\n", params.fname_inp.c_str());
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -148,9 +139,9 @@ bool audio_sdl_init(const int capture_id) {
{ {
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE); int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices); printf("%s: found %d capture devices:\n", __func__, nDevices);
for (int i = 0; i < nDevices; i++) { for (int i = 0; i < nDevices; i++) {
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE)); printf("%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
} }
} }
} }
@ -168,21 +159,21 @@ bool audio_sdl_init(const int capture_id) {
capture_spec_requested.samples = 1024; capture_spec_requested.samples = 1024;
if (capture_id >= 0) { if (capture_id >= 0) {
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE)); printf("%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0); g_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
} else { } else {
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__); printf("%s: attempt to open default capture device ...\n", __func__);
g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0); g_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
} }
if (!g_dev_id_in) { if (!g_dev_id_in) {
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError()); printf("%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
g_dev_id_in = 0; g_dev_id_in = 0;
} else { } else {
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in); printf("%s: obtained spec for input device (SDL Id = %d):\n", __func__, g_dev_id_in);
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq); printf("%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format); printf("%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format, capture_spec_requested.format);
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels); printf("%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels, capture_spec_requested.channels);
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples); printf("%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
} }
} }
@ -205,7 +196,7 @@ int main(int argc, char ** argv) {
// init audio // init audio
if (!audio_sdl_init(params.capture_id)) { if (!audio_sdl_init(-1)) {
fprintf(stderr, "%s: audio_sdl_init() failed!\n", __func__); fprintf(stderr, "%s: audio_sdl_init() failed!\n", __func__);
return 1; return 1;
} }
@ -215,55 +206,32 @@ int main(int argc, char ** argv) {
struct whisper_context * ctx = whisper_init(params.model.c_str()); struct whisper_context * ctx = whisper_init(params.model.c_str());
const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE; const int n_samples = (params.step_ms/1000.0)*WHISPER_SAMPLE_RATE;
const int n_samples_len = (params.length_ms/1000.0)*WHISPER_SAMPLE_RATE;
const int n_samples_30s = 30*WHISPER_SAMPLE_RATE; const int n_samples_30s = 30*WHISPER_SAMPLE_RATE;
std::vector<float> pcmf32(n_samples_30s, 0.0f); std::vector<float> pcmf32(n_samples_30s, 0.0f);
std::vector<float> pcmf32_old; std::vector<float> pcmf32_old;
const int n_new_line = params.length_ms / params.step_ms - 1;
// print some info about the processing // print some info about the processing
{ {
fprintf(stderr, "\n"); printf("\n");
if (!whisper_is_multilingual(ctx)) { if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) { if (params.language != "en" || params.translate) {
params.language = "en"; params.language = "en";
params.translate = false; params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__); printf("%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
} }
} }
fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n", printf("%s: processing %d samples (%.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__, __func__, n_samples, float(n_samples)/WHISPER_SAMPLE_RATE, params.n_threads,
n_samples,
float(n_samples)/WHISPER_SAMPLE_RATE,
float(n_samples_len)/WHISPER_SAMPLE_RATE,
params.n_threads,
params.language.c_str(), params.language.c_str(),
params.translate ? "translate" : "transcribe", params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1); params.no_timestamps ? 0 : 1);
printf("\n");
fprintf(stderr, "%s: n_new_line = %d\n", __func__, n_new_line);
fprintf(stderr, "\n");
} }
SDL_PauseAudioDevice(g_dev_id_in, 0); SDL_PauseAudioDevice(g_dev_id_in, 0);
int n_iter = 0;
bool is_running = true; bool is_running = true;
std::ofstream fout;
if (params.fname_out.length() > 0) {
fout.open(params.fname_out);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open output file '%s'!\n", __func__, params.fname_out.c_str());
return 1;
}
}
printf("[Start speaking]");
fflush(stdout);
// main audio loop // main audio loop
while (is_running) { while (is_running) {
// process SDL events: // process SDL events:
@ -271,35 +239,22 @@ int main(int argc, char ** argv) {
while (SDL_PollEvent(&event)) { while (SDL_PollEvent(&event)) {
switch (event.type) { switch (event.type) {
case SDL_QUIT: case SDL_QUIT:
{
is_running = false; is_running = false;
} break; break;
default: default:
break; break;
} }
} }
if (!is_running) { // process 3 seconds of new audio
break;
}
// process new audio
if (n_iter > 0 && SDL_GetQueuedAudioSize(g_dev_id_in) > 2*n_samples*sizeof(float)) {
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
SDL_ClearQueuedAudio(g_dev_id_in);
}
while (SDL_GetQueuedAudioSize(g_dev_id_in) < n_samples*sizeof(float)) { while (SDL_GetQueuedAudioSize(g_dev_id_in) < n_samples*sizeof(float)) {
SDL_Delay(1); SDL_Delay(1);
} }
const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float); const int n_samples_new = SDL_GetQueuedAudioSize(g_dev_id_in)/sizeof(float);
// take one second from previous iteration // take one second from previous iteration
//const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new)); // TODO: better strategy
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_30s/30 - n_samples_new));
// take up to params.length_ms audio from previous iteration
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_len - n_samples_new));
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size()); //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
@ -315,7 +270,7 @@ int main(int argc, char ** argv) {
// run the inference // run the inference
{ {
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); whisper_full_params wparams = whisper_full_default_params(WHISPER_DECODE_GREEDY);
wparams.print_progress = false; wparams.print_progress = false;
wparams.print_special_tokens = params.print_special_tokens; wparams.print_special_tokens = params.print_special_tokens;
@ -333,48 +288,23 @@ int main(int argc, char ** argv) {
// print result; // print result;
{ {
printf("\33[2K\r"); printf("\n");
// print long empty line to clear the previous line
printf("%s", std::string(100, ' ').c_str());
printf("\33[2K\r");
const int n_segments = whisper_full_n_segments(ctx); const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) { for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i); const char * text = whisper_full_get_segment_text(ctx, i);
if (params.no_timestamps) { if (params.no_timestamps) {
printf("%s", text); printf ("%s", text);
fflush(stdout); fflush(stdout);
if (params.fname_out.length() > 0) {
fout << text;
}
} else { } else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text); printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
if (params.fname_out.length() > 0) {
fout << "[" << to_timestamp(t0) << " --> " << to_timestamp(t1) << "] " << text << std::endl;
} }
} }
} }
if (params.fname_out.length() > 0) {
fout << std::endl;
}
}
++n_iter;
if ((n_iter % n_new_line) == 0) {
printf("\n");
pcmf32_old.clear();
}
} }
} }


@ -1,7 +1,3 @@
if (EMSCRIPTEN)
return()
endif()
set(TEST_TARGET test-main-tiny) set(TEST_TARGET test-main-tiny)
add_test(NAME ${TEST_TARGET} add_test(NAME ${TEST_TARGET}
COMMAND $<TARGET_FILE:main> COMMAND $<TARGET_FILE:main>

File diff suppressed because it is too large

149
whisper.h

@ -2,7 +2,6 @@
#define WHISPER_H #define WHISPER_H
#include <stdint.h> #include <stdint.h>
#include <stdbool.h>
#ifdef WHISPER_SHARED #ifdef WHISPER_SHARED
# ifdef _WIN32 # ifdef _WIN32
@ -31,99 +30,34 @@ extern "C" {
// //
// C interface // C interface
// //
// The following interface is thread-safe as long as the same whisper_context is not used by multiple threads
// concurrently. // TODO: documentation will come soon
//
// Basic usage:
//
// #include "whisper.h"
//
// ...
//
// struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
//
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n");
// return 7;
// }
//
// const int n_segments = whisper_full_n_segments(ctx);
// for (int i = 0; i < n_segments; ++i) {
// const char * text = whisper_full_get_segment_text(ctx, i);
// printf("%s", text);
// }
//
// whisper_free(ctx);
//
// ...
//
// This is a demonstration of the most straightforward usage of the library.
// "pcmf32" contains the RAW audio data in 32-bit floating point format.
//
// The interface also allows for more fine-grained control over the computation, but it requires a deeper
// understanding of how the model works.
//
struct whisper_context; struct whisper_context;
typedef int whisper_token; typedef int whisper_token;
typedef struct whisper_token_data {
whisper_token id; // token id
whisper_token tid; // forced timestamp token id
float p; // probability of the token
float pt; // probability of the timestamp token
float ptsum; // sum of probabilities of all timestamp tokens
// token-level timestamp data
// do not use if you haven't computed token-level timestamps
int64_t t0; // start time of the token
int64_t t1; // end time of the token
float vlen; // voice length of the token
} whisper_token_data;
// Allocates all memory needed for the model and loads the model from the given file.
// Returns NULL on failure.
WHISPER_API struct whisper_context * whisper_init(const char * path_model); WHISPER_API struct whisper_context * whisper_init(const char * path_model);
// Frees all memory allocated by the model.
WHISPER_API void whisper_free(struct whisper_context * ctx); WHISPER_API void whisper_free(struct whisper_context * ctx);
// Convert RAW PCM audio to log mel spectrogram.
// The resulting spectrogram is stored inside the provided whisper context.
// Returns 0 on success
WHISPER_API int whisper_pcm_to_mel( WHISPER_API int whisper_pcm_to_mel(
struct whisper_context * ctx, struct whisper_context * ctx,
const float * samples, const float * samples,
int n_samples, int n_samples,
int n_threads); int n_threads);
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
// n_mel must be 80 // n_mel must be 80
// Returns 0 on success
WHISPER_API int whisper_set_mel( WHISPER_API int whisper_set_mel(
struct whisper_context * ctx, struct whisper_context * ctx,
const float * data, const float * data,
int n_len, int n_len,
int n_mel); int n_mel);
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
// offset can be used to specify the offset of the first frame in the spectrogram.
// Returns 0 on success
WHISPER_API int whisper_encode( WHISPER_API int whisper_encode(
struct whisper_context * ctx, struct whisper_context * ctx,
int offset, int offset,
int n_threads); int n_threads);
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
// Make sure to call whisper_encode() first.
// tokens + n_tokens is the provided context for the decoder.
// n_past is the number of tokens to use from previous decoder calls.
// Returns 0 on success
WHISPER_API int whisper_decode( WHISPER_API int whisper_decode(
struct whisper_context * ctx, struct whisper_context * ctx,
const whisper_token * tokens, const whisper_token * tokens,
@ -131,15 +65,10 @@ extern "C" {
int n_past, int n_past,
int n_threads); int n_threads);
// Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode().
// You can also implement your own sampling method using the whisper_get_probs() function.
// whisper_sample_best() returns the token with the highest probability
// whisper_sample_timestamp() returns the most probable timestamp token
WHISPER_API whisper_token_data whisper_sample_best(struct whisper_context * ctx);
WHISPER_API whisper_token whisper_sample_best(struct whisper_context * ctx, bool need_timestamp);
WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);
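// Putting the fine-grained calls together, a manual transcription loop might be
// sketched as follows (illustrative only - it uses the whisper_sample_best()
// variant that takes just the context, greedy sampling, and no timestamp
// handling; n_threads = 4 is an arbitrary choice):
//
//     if (whisper_pcm_to_mel(ctx, pcmf32, n_samples, 4) != 0) { /* error */ }
//     if (whisper_encode(ctx, 0, 4) != 0) { /* error */ }
//
//     whisper_token tokens[512];
//     int n_tokens = 0;
//     int n_past   = 0;
//
//     tokens[n_tokens++] = whisper_token_sot(ctx);
//
//     while (n_tokens < 512) {
//         // feed only the tokens not yet seen by the decoder
//         if (whisper_decode(ctx, tokens + n_past, n_tokens - n_past, n_past, 4) != 0) {
//             break; // error
//         }
//         n_past = n_tokens;
//
//         const whisper_token_data data = whisper_sample_best(ctx);
//         if (data.id == whisper_token_eot(ctx)) {
//             break; // end of text
//         }
//
//         printf("%s", whisper_token_to_str(ctx, data.id));
//         tokens[n_tokens++] = data.id;
//     }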
// Return the id of the specified language, returns -1 if not found
WHISPER_API int whisper_lang_id(const char * lang);
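// For example (a minimal sketch):
//
//     const int lang_id = whisper_lang_id("de");
//     if (lang_id == -1) {
//         fprintf(stderr, "unknown language\n");
//     }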
WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
@@ -147,13 +76,10 @@ extern "C" {
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
// The probabilities for the next token
WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
// Special tokens
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
@@ -161,33 +87,23 @@ extern "C" {
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
// Task tokens
WHISPER_API whisper_token whisper_token_translate ();
WHISPER_API whisper_token whisper_token_transcribe();
// Performance information
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
////////////////////////////////////////////////////////////////////////////
// Available sampling strategies
enum whisper_sampling_strategy {
WHISPER_SAMPLING_GREEDY, // Always select the most probable token
WHISPER_SAMPLING_BEAM_SEARCH, // TODO: not implemented yet!
};
enum whisper_decode_strategy {
WHISPER_DECODE_GREEDY,
WHISPER_DECODE_BEAM_SEARCH,
};
// Text segment callback
// Called on every newly generated text segment
// Use the whisper_full_...() functions to obtain the text segments
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
struct whisper_full_params {
enum whisper_sampling_strategy strategy;
enum whisper_decode_strategy strategy;
int n_threads;
int n_max_text_ctx;
int offset_ms; // start offset in ms
int duration_ms; // audio duration to process in ms
int offset_ms;
bool translate;
bool no_context;
@@ -196,14 +112,9 @@ extern "C" {
bool print_realtime;
bool print_timestamps;
// [EXPERIMENTAL] token-level timestamps
bool token_timestamps; // enable token-level timestamps
float thold_pt; // timestamp token probability threshold (~0.01)
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
int max_len; // max segment length in characters
const char * language;
union {
struct {
int n_past;
} greedy;
@@ -213,59 +124,25 @@ extern "C" {
int beam_width;
int n_best;
} beam_search;
};
whisper_new_segment_callback new_segment_callback;
void * new_segment_callback_user_data;
};
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_decode_strategy strategy);
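// A typical setup flow, sketched under the assumption that the fuller variant of
// the struct shown above (with the new_segment_callback members) is available;
// on_new_segment() is a user-defined function, not part of this API:
//
//     void on_new_segment(struct whisper_context * ctx, int n_new, void * user_data) {
//         // print only the segments generated by the latest decoder pass
//         const int n_segments = whisper_full_n_segments(ctx);
//         for (int i = n_segments - n_new; i < n_segments; ++i) {
//             printf("%s", whisper_full_get_segment_text(ctx, i));
//         }
//     }
//
//     ...
//
//     struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
//
//     params.n_threads            = 4;
//     params.language             = "en";
//     params.translate            = false;
//     params.new_segment_callback = on_new_segment;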
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// full whisper run - encode + decode
// Uses the specified decoding strategy to obtain the text.
WHISPER_API int whisper_full(
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples);
// Split the input audio into chunks and process each chunk separately using whisper_full()
// This approach can give some speedup on multi-core systems, but the transcription
// accuracy can be worse at the beginning and end of each chunk.
WHISPER_API int whisper_full_parallel(
struct whisper_context * ctx,
struct whisper_full_params params,
const float * samples,
int n_samples,
const int n_processors);
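// For example (a minimal sketch - splitting the work across 4 processors):
//
//     if (whisper_full_parallel(ctx, params, pcmf32, n_samples, 4) != 0) {
//         fprintf(stderr, "failed to process audio\n");
//     }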
// Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph.
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
// Get the start and end time of the specified segment.
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
// Get the text of the specified segment.
WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
// Get number of tokens in the specified segment.
WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
// Get the token text of the specified token in the specified segment.
WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
// Get token data for the specified token in the specified segment.
// This contains probabilities, timestamps, etc.
WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
// Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
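// Together, the segment and token accessors support post-processing such as this
// sketch, which prints each segment with its timestamps and per-token
// probabilities (assuming t0/t1 are expressed in units of 10 ms):
//
//     for (int i = 0; i < whisper_full_n_segments(ctx); ++i) {
//         const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
//         const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
//
//         printf("[%lld -> %lld ms] %s\n",
//                 (long long) (t0*10), (long long) (t1*10),
//                 whisper_full_get_segment_text(ctx, i));
//
//         for (int j = 0; j < whisper_full_n_tokens(ctx, i); ++j) {
//             printf("  token '%s' p = %.3f\n",
//                     whisper_full_get_token_text(ctx, i, j),
//                     whisper_full_get_token_p(ctx, i, j));
//         }
//     }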
// Print system information
WHISPER_API const char * whisper_print_system_info();
#ifdef __cplusplus
}
#endif