Compare commits


24 Commits

Author SHA1 Message Date
fa8dbdc888 release : v1.4.0 2023-04-30 19:23:37 +03:00
4a7d49af95 examples : fix + refactor Levenshtein distance 2023-04-30 19:12:49 +03:00
794b162a46 whisper : add integer quantization support (#540)
* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
2023-04-30 18:51:57 +03:00
5fd1bdd7fc whisper : add GPU support via cuBLAS (#834)
* make : add WHISPER_CUBLAS

* make : fix CUBLAS build

* whisper : disable Flash Attention + adjust memory buffers

* whisper : remove old commented code

* readme : add cuBLAS instructions

* cmake : add WHISPER_CUBLAS option

* gitignore : ignore build-cublas
2023-04-30 12:14:33 +03:00
0ccd6746c9 ggml : fix WASM build 2023-04-29 21:37:23 +03:00
d9b550c0a1 ggml : fix 32-bit ARM NEON (#836)
* ggml : add support for 32-bit ARM

* ggml : fix

* ggml : fix
2023-04-29 21:33:33 +03:00
e9b091c92a ggml : use vzip instead of vuzp for consistency 2023-04-29 21:14:09 +03:00
1f30b99208 ggml : fix WASM build 2023-04-29 20:21:25 +03:00
05c3ea3bc8 ggml : sync with ggml repo (warning fixes + asserts) 2023-04-29 19:33:28 +03:00
6108d3cc58 whisper : use correct seek_end when offset is used (#833)
Whenever an `offset_ms` is provided, the value of `seek_end` is
calculated incorrectly. This causes Whisper to keep transcribing
after the end of the file.

The current behavior looks like
```
[00:34:40.000 --> 00:34:47.000]   This is an example audio file.
[00:34:47.000 --> 00:34:49.000]   The text has been redacted
[00:34:49.000 --> 00:34:51.000]   This is the end of the audio.
[00:34:51.000 --> 00:34:52.000]   ***
[00:34:52.000 --> 00:34:53.000]   ***
[00:34:53.000 --> 00:34:54.000]   ***
[00:34:55.000 --> 00:34:56.000]   ***
...
```

The expected behavior should be
```
[00:34:40.000 --> 00:34:47.000]   This is an example audio file.
[00:34:47.000 --> 00:34:49.000]   The text has been redacted
[00:34:49.000 --> 00:34:51.000]   This is the end of the audio.
- end of program -
```

This commit changes the calculation of the `seek_end` variable to
only add `seek_start` if a custom `duration_ms` is provided.
Otherwise, it defaults to the end of the file.
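
As an illustrative sketch of the corrected calculation (hypothetical names, not
the exact whisper.cpp code):
```cpp
// Illustrative only: seek positions are measured in 10 ms frames.
static int compute_seek_end(int offset_ms, int duration_ms, int n_frames_total) {
    const int seek_start = offset_ms / 10;
    return duration_ms == 0
        ? n_frames_total                 // default: transcribe to the end of the file
        : seek_start + duration_ms / 10; // add seek_start only for a custom duration
}
```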

Signed-off-by: Thijs Raymakers <thijs@raymakers.nl>
2023-04-29 18:55:37 +03:00
bab97c83d0 tests : add "threads" to run-tests.sh 2023-04-29 12:32:28 +03:00
3eaeb030ff extra : add sync-ggml.sh script 2023-04-29 12:32:28 +03:00
acec73ab6e ggml : sync latest ggml + llama.cpp updates (quantization) 2023-04-29 12:32:28 +03:00
5cc17418c7 whisper.android : add some tips (#816) 2023-04-29 11:00:20 +03:00
3efb81dec6 build : add WHISPER_COREML_ALLOW_FALLBACK to make / CMake (#812) 2023-04-29 10:55:24 +03:00
94a7cd2a07 whisper : allow non-CoreML fallback when Core ML cannot be loaded (#812)
If the Core ML model cannot be loaded, continue without Core ML instead of
returning. This allows a single build to transcribe using Core ML models
where available, and regular models when not.
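
A minimal sketch of the fallback pattern, assuming a compile-time flag like the
WHISPER_COREML_ALLOW_FALLBACK option added in this change (function and variable
names are hypothetical):
```cpp
#include <cstdio>

// Hypothetical stand-in for the real Core ML encoder loader.
static void * coreml_init(const char * /*path_coreml*/) { return nullptr; }

// Returns false only when Core ML fails *and* no fallback is allowed.
static bool init_encoder(void ** ctx_coreml, const char * path_coreml) {
    *ctx_coreml = coreml_init(path_coreml);
    if (*ctx_coreml == nullptr) {
        fprintf(stderr, "failed to load Core ML model from '%s'\n", path_coreml);
#ifndef WHISPER_COREML_ALLOW_FALLBACK
        return false; // previous behavior: abort initialization
#endif
        // fallback allowed: leave the Core ML context null and use the regular ggml encoder
    }
    return true;
}
```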
2023-04-29 10:49:02 +03:00
3e82ff4747 whisper : fix bug from previous commit 2023-04-29 10:42:14 +03:00
b5bd2f43c5 whisper : avoid designated initializers 2023-04-29 10:36:50 +03:00
94aa56f19e minor : improve C++ and Python style (#768)
* use some STL functions

* use self.field rather than setattr, use pathlib.Path

* recover some format

* const some iter

* Keep the original

* 2 space
2023-04-29 10:06:25 +03:00
4d89ee2e59 readme : add logo 2023-04-28 22:41:29 +03:00
70567eff23 main : escape quotes in csv output (#815) 2023-04-23 19:01:59 +03:00
02ec83c5d5 stream : flush upon finishing inference (#811) 2023-04-23 17:00:30 +03:00
2bd4b8d577 examples : add missing #include <cstdint> (#798)
common.cpp uses uint8_t and uint64_t, which are defined in <cstdint>.
2023-04-23 16:52:52 +03:00
eecf2c3d41 main : update escape_double_quotes() function (#776)
Updated the escape_double_quotes() function so that it now escapes both double quotes and backslashes in the input string.

Changes Made:

- Renamed the function to escape_double_quotes_and_backslashes

- Modified the condition in the first loop to increment the value of 'escaped_length' for both double quotes and backslashes.

- Modified the condition in the second loop to add a backslash before the current character if it is a double quote or a backslash (see the illustrative usage below).
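
Illustrative usage, with the function name as it appears in the main.cpp diff
further below (expected output shown in comments):
```cpp
//   input   : She said "hi" in C:\tmp
//   escaped : She said \"hi\" in C:\\tmp
char * escaped = escape_double_quotes_and_backslashes("She said \"hi\" in C:\\tmp");
printf("%s\n", escaped);
free(escaped);
```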

Resolves: #769
2023-04-23 16:47:30 +03:00
56 changed files with 7756 additions and 3115 deletions

.gitignore (vendored), 2 changed lines

@ -12,6 +12,7 @@ build-em/
build-debug/ build-debug/
build-release/ build-release/
build-static/ build-static/
build-cublas/
build-no-accel/ build-no-accel/
build-sanitize-addr/ build-sanitize-addr/
build-sanitize-thread/ build-sanitize-thread/
@ -22,6 +23,7 @@ build-sanitize-thread/
/talk /talk
/talk-llama /talk-llama
/bench /bench
/quantize
arm_neon.h arm_neon.h
sync.sh sync.sh


@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.0) cmake_minimum_required (VERSION 3.0)
project(whisper.cpp VERSION 1.3.0) project(whisper.cpp VERSION 1.4.0)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_compile_options(/utf-8) add_compile_options(/utf-8)
@ -51,7 +51,7 @@ option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF
option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE}) option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE}) option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF) option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
if (APPLE) if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF) option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
@ -60,8 +60,10 @@ if (APPLE)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF) option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF) option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
else() else()
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF) option(WHISPER_OPENBLAS "whisper: support for OpenBLAS" OFF)
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
endif() endif()
option(WHISPER_PERF "whisper: enable perf timings" OFF) option(WHISPER_PERF "whisper: enable perf timings" OFF)
@ -119,10 +121,14 @@ if (APPLE)
else() else()
message(WARNING "CoreML framework not found") message(WARNING "CoreML framework not found")
endif() endif()
if (WHISPER_COREML_ALLOW_FALLBACK)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML_ALLOW_FALLBACK)
endif()
endif() endif()
endif() endif()
if (WHISPER_SUPPORT_OPENBLAS) if (WHISPER_OPENBLAS)
find_library(OPENBLAS_LIB find_library(OPENBLAS_LIB
NAMES openblas libopenblas NAMES openblas libopenblas
) )
@ -136,6 +142,31 @@ if (WHISPER_SUPPORT_OPENBLAS)
endif() endif()
endif() endif()
if (WHISPER_CUBLAS)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
enable_language(CUDA)
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
if (WHISPER_STATIC)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
else()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
else()
message(WARNING "cuBLAS not found")
endif()
endif()
# compiler flags # compiler flags
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@ -242,6 +273,7 @@ set(TARGET whisper)
add_library(${TARGET} add_library(${TARGET}
ggml.h ggml.h
ggml.c ggml.c
${GGML_CUDA_SOURCES}
whisper.h whisper.h
whisper.cpp whisper.cpp
) )
@ -271,7 +303,19 @@ if (BUILD_SHARED_LIBS)
target_compile_definitions(${TARGET} PUBLIC target_compile_definitions(${TARGET} PUBLIC
WHISPER_SHARED WHISPER_SHARED
GGML_SHARED
) )
target_compile_definitions(${TARGET} PRIVATE
WHISPER_BUILD
GGML_BUILD
)
endif()
if (GGML_CUDA_SOURCES)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
endif() endif()
if (EMSCRIPTEN) if (EMSCRIPTEN)


@ -1,3 +1,5 @@
default: main bench quantize
ifndef UNAME_S ifndef UNAME_S
UNAME_S := $(shell uname -s) UNAME_S := $(shell uname -s)
endif endif
@ -123,6 +125,7 @@ endif
ifeq ($(UNAME_M),amd64) ifeq ($(UNAME_M),amd64)
CFLAGS += -mavx -mavx2 -mfma -mf16c CFLAGS += -mavx -mavx2 -mfma -mf16c
endif endif
ifneq ($(filter ppc64%,$(UNAME_M)),) ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M))) ifneq (,$(findstring POWER9,$(POWER9_M)))
@ -133,6 +136,7 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif endif
endif endif
ifndef WHISPER_NO_ACCELERATE ifndef WHISPER_NO_ACCELERATE
# Mac M1 - include Accelerate framework # Mac M1 - include Accelerate framework
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
@ -140,26 +144,48 @@ ifndef WHISPER_NO_ACCELERATE
LDFLAGS += -framework Accelerate LDFLAGS += -framework Accelerate
endif endif
endif endif
ifdef WHISPER_COREML ifdef WHISPER_COREML
CXXFLAGS += -DWHISPER_USE_COREML CXXFLAGS += -DWHISPER_USE_COREML
LDFLAGS += -framework Foundation -framework CoreML LDFLAGS += -framework Foundation -framework CoreML
ifdef WHISPER_COREML_ALLOW_FALLBACK
CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
endif endif
endif
ifdef WHISPER_OPENBLAS ifdef WHISPER_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas LDFLAGS += -lopenblas
endif endif
ifdef WHISPER_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
WHISPER_OBJ += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif
ifdef WHISPER_GPROF ifdef WHISPER_GPROF
CFLAGS += -pg CFLAGS += -pg
CXXFLAGS += -pg CXXFLAGS += -pg
endif endif
ifneq ($(filter aarch64%,$(UNAME_M)),) ifneq ($(filter aarch64%,$(UNAME_M)),)
CFLAGS += -mcpu=native CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native CXXFLAGS += -mcpu=native
endif endif
ifneq ($(filter armv6%,$(UNAME_M)),) ifneq ($(filter armv6%,$(UNAME_M)),)
# 32-bit Raspberry Pi 1, 2, 3 # 32-bit Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
endif endif
ifneq ($(filter armv7%,$(UNAME_M)),) ifneq ($(filter armv7%,$(UNAME_M)),)
# 32-bit ARM, for example on Armbian or possibly raspbian # 32-bit ARM, for example on Armbian or possibly raspbian
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
@ -167,6 +193,7 @@ ifneq ($(filter armv7%,$(UNAME_M)),)
# 64-bit ARM, use these (TODO: auto-detect 64-bit) # 64-bit ARM, use these (TODO: auto-detect 64-bit)
# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations # CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif endif
ifneq ($(filter armv8%,$(UNAME_M)),) ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4 # Raspberry Pi 4
CFLAGS += -mfp16-format=ieee -mno-unaligned-access CFLAGS += -mfp16-format=ieee -mno-unaligned-access
@ -187,20 +214,18 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV)) $(info I CXX: $(CXXV))
$(info ) $(info )
default: main bench
# #
# Build library # Build library
# #
ggml.o: ggml.c ggml.h ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o $(CC) $(CFLAGS) -c $< -o $@
whisper.o: whisper.cpp whisper.h ggml.h whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o $(CXX) $(CXXFLAGS) -c $< -o $@
ifndef WHISPER_COREML ifndef WHISPER_COREML
WHISPER_OBJ = whisper.o WHISPER_OBJ += whisper.o
else else
whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o $(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
@ -208,7 +233,7 @@ whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o $(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
endif endif
libwhisper.a: ggml.o $(WHISPER_OBJ) libwhisper.a: ggml.o $(WHISPER_OBJ)
@ -218,7 +243,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
clean: clean:
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
# #
# Examples # Examples
@ -226,7 +251,7 @@ clean:
CC_SDL=`sdl2-config --cflags --libs` CC_SDL=`sdl2-config --cflags --libs`
SRC_COMMON = examples/common.cpp SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
SRC_COMMON_SDL = examples/common-sdl.cpp SRC_COMMON_SDL = examples/common-sdl.cpp
main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
@ -236,6 +261,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS) $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS) $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)


@ -1,10 +1,12 @@
# whisper.cpp # whisper.cpp
![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
[![Actions Status](https://github.com/ggerganov/whisper.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/whisper.cpp/actions) [![Actions Status](https://github.com/ggerganov/whisper.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/whisper.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/) [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Beta: [v1.3.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.3.0) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126) Beta: [v1.4.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.0) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model: High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -13,9 +15,11 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- AVX intrinsics support for x86 architectures - AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures - VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision - Mixed F16 / F32 precision
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
- Low memory usage (Flash Attention) - Low memory usage (Flash Attention)
- Zero memory allocations at runtime - Zero memory allocations at runtime
- Runs on the CPU - Runs on the CPU
- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h) - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
Supported platforms: Supported platforms:
@ -225,6 +229,22 @@ make large
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` | | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` | | large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Quantization
`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
Quantized models require less memory and disk space and, depending on the hardware, can be processed more efficiently.
Here are the steps for creating and using a quantized model:
```bash
# quantize a model with Q5_0 method
make quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
# run the examples as usual, specifying the quantized model file
./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
```
## Core ML support ## Core ML support
On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
@ -279,10 +299,23 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566). For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
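
As a hedged example, a Core ML build that tolerates a missing Core ML model can
be produced with the flags introduced in the Makefile / CMakeLists.txt changes of
this compare (illustrative invocation, not taken verbatim from the README):
```bash
# build with Core ML support and allow falling back to the regular encoder
make clean
WHISPER_COREML=1 WHISPER_COREML_ALLOW_FALLBACK=1 make -j

# or, with CMake
cmake -B build -DWHISPER_COREML=1 -DWHISPER_COREML_ALLOW_FALLBACK=1
cmake --build build -j
```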
## NVIDIA GPU support via cuBLAS
With NVIDIA cards, the Encoder processing can be offloaded to the GPU to a large extent through cuBLAS.
First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
Now build `whisper.cpp` with cuBLAS support:
```
make clean
WHISPER_CUBLAS=1 make -j
```
Run all the examples as usual.
## Limitations ## Limitations
- Inference only - Inference only
- No GPU support (yet)
## Another example ## Another example


@ -1,6 +1,6 @@
{ {
"name": "whisper.cpp", "name": "whisper.cpp",
"version": "1.3.0", "version": "1.4.0",
"description": "Whisper speech recognition", "description": "Whisper speech recognition",
"main": "whisper.js", "main": "whisper.js",
"scripts": { "scripts": {

File diff suppressed because one or more lines are too long


@ -4,7 +4,7 @@ find_package(Threads REQUIRED)
# third-party # third-party
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# SDL2 # SDL2
find_package(SDL2 REQUIRED) find_package(SDL2 REQUIRED)
@ -21,13 +21,17 @@ set(TARGET common)
add_library(${TARGET} STATIC add_library(${TARGET} STATIC
common.h common.h
common.cpp common.cpp
common-ggml.h
common-ggml.cpp
) )
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE whisper)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# common-sdl # common-sdl
set(TARGET common-sdl) set(TARGET common-sdl)
@ -62,6 +66,7 @@ else()
add_subdirectory(stream) add_subdirectory(stream)
add_subdirectory(command) add_subdirectory(command)
add_subdirectory(bench) add_subdirectory(bench)
add_subdirectory(quantize)
add_subdirectory(talk) add_subdirectory(talk)
add_subdirectory(talk-llama) add_subdirectory(talk-llama)
endif() endif()


@ -18,5 +18,6 @@ describe("Run whisper.node", () => {
let result = await whisperAsync(whisperParamsMock); let result = await whisperAsync(whisperParamsMock);
expect(result.length).toBeGreaterThan(0); expect(result.length).toBeGreaterThan(0);
}, 10000);
}); });
});


@ -31,9 +31,9 @@ endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \ --bind \
-s USE_PTHREADS=1 \ -s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \ -s PTHREAD_POOL_SIZE_STRICT=0 \
-s INITIAL_MEMORY=1024MB \ -s INITIAL_MEMORY=2000MB \
-s TOTAL_MEMORY=1024MB \ -s TOTAL_MEMORY=2000MB \
-s FORCE_FILESYSTEM=1 \ -s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \ -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \ ${EXTRA_FLAGS} \


@ -35,6 +35,15 @@
<br><br> <br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr> <hr>
Select the model you would like to use and click the "Bench" button.<br> Select the model you would like to use and click the "Bench" button.<br>
@ -46,9 +55,16 @@
Whisper model: <span id="model-whisper-status"></span> Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<span id="fetch-whisper-progress"></span> <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" /> <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
<span id="fetch-whisper-progress"></span>
</div> </div>
<br> <br>
@ -160,6 +176,14 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none'; document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name; document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
} }
@ -168,11 +192,25 @@
let urls = { let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
}; };
let sizes = { let sizes = {
'tiny.en': 75, 'tiny.en': 75,
'base.en': 142, 'base.en': 142,
'small.en': 466,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
'small-en-q5_1': 182,
'medium-en-q5_0': 515,
'large-q5_0': 1030,
}; };
let url = urls[model]; let url = urls[model];
@ -181,6 +219,15 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... '; document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) { cbProgress = function(p) {
@ -192,6 +239,15 @@
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };


@ -28,31 +28,6 @@ std::string g_transcribed = "";
std::vector<float> g_pcmf32; std::vector<float> g_pcmf32;
// compute similarity between two strings using Levenshtein distance
static float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
void command_set_status(const std::string & status) { void command_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex); std::lock_guard<std::mutex> lock(g_mutex);
g_status = status; g_status = status;


@ -35,6 +35,15 @@
<br><br> <br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr> <hr>
Select the model you would like to use, click the "Start" button and follow the instructions. Select the model you would like to use, click the "Start" button and follow the instructions.
@ -45,6 +54,10 @@
Whisper model: <span id="model-whisper-status"></span> Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span> <span id="fetch-whisper-progress"></span>
<!-- <!--
@ -162,11 +175,17 @@
let urls = { let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
}; };
let sizes = { let sizes = {
'tiny.en': 75, 'tiny.en': 75,
'base.en': 142, 'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
}; };
let url = urls[model]; let url = urls[model];
@ -177,6 +196,10 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... '; document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) { cbProgress = function(p) {
@ -188,6 +211,10 @@
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };


@ -1,4 +1,4 @@
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# command # command
set(TARGET command) set(TARGET command)
add_executable(${TARGET} command.cpp) add_executable(${TARGET} command.cpp)


@ -163,31 +163,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
return result; return result;
} }
// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
std::vector<std::string> read_allowed_commands(const std::string & fname) { std::vector<std::string> read_allowed_commands(const std::string & fname) {
std::vector<std::string> allowed_commands; std::vector<std::string> allowed_commands;

examples/common-ggml.cpp (new file, 241 lines)

@ -0,0 +1,241 @@
#include "common-ggml.h"
#include <regex>
#include <map>
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
};
void ggml_print_ftypes(FILE * fp) {
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
}
}
enum ggml_ftype ggml_parse_ftype(const char * str) {
enum ggml_ftype ftype;
if (str[0] == 'q') {
const auto it = GGML_FTYPE_MAP.find(str);
if (it == GGML_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
return GGML_FTYPE_UNKNOWN;
}
ftype = it->second;
} else {
ftype = (enum ggml_ftype) atoi(str);
}
return ftype;
}
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip) {
ggml_type qtype = GGML_TYPE_F32;
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_UNKNOWN:
case GGML_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
}
};
if (!ggml_is_quantized(qtype)) {
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
return false;
}
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<float> work;
std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;
std::vector<int64_t> hist_all(1 << 4, 0);
while (true) {
int32_t n_dims;
int32_t length;
int32_t ttype;
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (finp.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
finp.read (&name[0], length);
printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype));
bool quantize = false;
// check if we should quantize this tensor
for (const auto & s : to_quant) {
if (std::regex_match(name, std::regex(s))) {
quantize = true;
break;
}
}
// check if we should skip this tensor
for (const auto & s : to_skip) {
if (std::regex_match(name, std::regex(s))) {
quantize = false;
break;
}
}
// quantize only 2D tensors
quantize &= (n_dims == 2);
if (quantize) {
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
return false;
}
if (ttype == GGML_TYPE_F16) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
}
} else {
data_f32.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}
ttype = qtype;
} else {
const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
}
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
for (int i = 0; i < n_dims; ++i) {
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}
fout.write(&name[0], length);
if (quantize) {
work.resize(nelements); // for quantization
size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_2:
{
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_1:
{
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q8_0:
{
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
return false;
}
}
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size;
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}
for (int i = 0; i < hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
} else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
total_size_new += data_u8.size();
}
total_size_org += nelements * sizeof(float);
}
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
{
int64_t sum_all = 0;
for (int i = 0; i < hist_all.size(); ++i) {
sum_all += hist_all[i];
}
printf("%s: hist: ", __func__);
for (int i = 0; i < hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
}
printf("\n");
}
return true;
}

examples/common-ggml.h (new file, 18 lines)

@ -0,0 +1,18 @@
#pragma once
#include "ggml.h"
#include <fstream>
#include <vector>
#include <string>
enum ggml_ftype ggml_parse_ftype(const char * str);
void ggml_print_ftypes(FILE * fp = stderr);
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip);


@ -6,12 +6,86 @@
#include "dr_wav.h" #include "dr_wav.h"
#include <cmath> #include <cmath>
#include <fstream>
#include <regex> #include <regex>
#ifndef M_PI #ifndef M_PI
#define M_PI 3.14159265358979323846 #define M_PI 3.14159265358979323846
#endif #endif
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-p" || arg == "--prompt") {
params.prompt = argv[++i];
} else if (arg == "-n" || arg == "--n_predict") {
params.n_predict = std::stoi(argv[++i]);
} else if (arg == "--top_k") {
params.top_k = std::stoi(argv[++i]);
} else if (arg == "--top_p") {
params.top_p = std::stof(argv[++i]);
} else if (arg == "--temp") {
params.temp = std::stof(argv[++i]);
} else if (arg == "-b" || arg == "--batch_size") {
params.n_batch = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
default: return "To";
}
return "The";
}
std::string trim(const std::string & s) { std::string trim(const std::string & s) {
std::regex e("^\\s+|\\s+$"); std::regex e("^\\s+|\\s+$");
return std::regex_replace(s, e, ""); return std::regex_replace(s, e, "");
@ -27,6 +101,251 @@ std::string replace(const std::string & s, const std::string & from, const std::
return result; return result;
} }
std::map<std::string, int32_t> json_parse(const std::string & fname) {
std::map<std::string, int32_t> result;
// read file into string
std::string json;
{
std::ifstream ifs(fname);
if (!ifs) {
fprintf(stderr, "Failed to open %s\n", fname.c_str());
exit(1);
}
json = std::string((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
}
if (json[0] != '{') {
return result;
}
// parse json
{
bool has_key = false;
bool in_token = false;
std::string str_key = "";
std::string str_val = "";
int n = json.size();
for (int i = 1; i < n; ++i) {
if (!in_token) {
if (json[i] == ' ') continue;
if (json[i] == '"') {
in_token = true;
continue;
}
} else {
if (json[i] == '\\' && i+1 < n) {
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
++i;
} else if (json[i] == '"') {
if (has_key == false) {
has_key = true;
++i;
while (json[i] == ' ') ++i;
++i; // :
while (json[i] == ' ') ++i;
if (json[i] != '\"') {
while (json[i] != ',' && json[i] != '}') {
str_val += json[i++];
}
has_key = false;
} else {
in_token = true;
continue;
}
} else {
has_key = false;
}
str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "
try {
result[str_key] = std::stoi(str_val);
} catch (...) {
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
}
str_key = "";
str_val = "";
in_token = false;
continue;
}
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
}
}
}
return result;
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.size() == 0) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
vocab.token_to_id = ::json_parse(fname);
for (const auto & kv : vocab.token_to_id) {
vocab.id_to_token[kv.second] = kv.first;
}
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
// print the vocabulary
//for (auto kv : vocab.token_to_id) {
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
//}
return true;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const double scale = 1.0/temp;
for (int i = 0; i < n_logits; ++i) {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
}
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
top_k = i + 1;
probs.resize(top_k);
logits_id.resize(top_k);
break;
}
}
cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
}
}
//printf("\n");
//for (int i = 0; i < (int) probs.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//}
//exit(0);
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) { bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav; drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin std::vector<uint8_t> wav_data; // used for pipe input from stdin
@ -160,3 +479,27 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
return true; return true;
} }
float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}


@ -1,10 +1,44 @@
// Various helper functions and utilities
#pragma once #pragma once
// needs to match WHISPER_SAMPLE_RATE #include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>
#define COMMON_SAMPLE_RATE 16000 #define COMMON_SAMPLE_RATE 16000
#include <vector> //
#include <string> // CLI argument parsing
//
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 200; // new tokens to predict
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 0.9f;
int32_t n_batch = 8; // batch size for prompt processing
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
std::string prompt;
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
std::string trim(const std::string & s); std::string trim(const std::string & s);
@ -13,6 +47,52 @@ std::string replace(
const std::string & from, const std::string & from,
const std::string & to); const std::string & to);
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};
// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);
// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
// sample next token given probabilities for each embedding
//
// - consider only the top K tokens
// - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
// TODO: temperature is not implemented
//
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng);
//
// Audio utils
//
// Read WAV audio file and store the PCM data into pcmf32 // Read WAV audio file and store the PCM data into pcmf32
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
@ -38,3 +118,5 @@ bool vad_simple(
float freq_thold, float freq_thold,
bool verbose); bool verbose);
// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1);


@ -145,7 +145,15 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
var db = event.target.result; var db = event.target.result;
var tx = db.transaction(['models'], 'readwrite'); var tx = db.transaction(['models'], 'readwrite');
var os = tx.objectStore('models'); var os = tx.objectStore('models');
var rq = null;
try {
var rq = os.put(data, url); var rq = os.put(data, url);
} catch (e) {
cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
cbCancel();
return;
}
rq.onsuccess = function (event) { rq.onsuccess = function (event) {
cbPrint('loadRemote: "' + url + '" stored in the IndexedDB'); cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
@ -180,7 +188,6 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
rq.onabort = function (event) { rq.onabort = function (event) {
cbPrint('loadRemote: failed to open IndexedDB: abort'); cbPrint('loadRemote: failed to open IndexedDB: abort');
cbCancel();
}; };
} }


@ -352,6 +352,37 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
return true; return true;
} }
char *escape_double_quotes_and_backslashes(const char *str) {
if (str == NULL) {
return NULL;
}
size_t escaped_length = strlen(str) + 1;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"' || str[i] == '\\') {
escaped_length++;
}
}
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) {
return NULL;
}
size_t pos = 0;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"' || str[i] == '\\') {
escaped[pos++] = '\\';
}
escaped[pos++] = str[i];
}
// no need to set zero due to calloc() being used prior
return escaped;
}
bool output_csv(struct whisper_context * ctx, const char * fname) { bool output_csv(struct whisper_context * ctx, const char * fname) {
std::ofstream fout(fname); std::ofstream fout(fname);
if (!fout.is_open()) { if (!fout.is_open()) {
@ -367,47 +398,15 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
const char * text = whisper_full_get_segment_text(ctx, i); const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
char * text_escaped = escape_double_quotes_and_backslashes(text);
//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds. //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
fout << 10 * t0 << "," << 10 * t1 << ",\"" << text << "\"\n"; fout << 10 * t0 << "," << 10 * t1 << ",\"" << text_escaped << "\"\n";
} }
return true; return true;
} }
char *escape_double_quotes(const char *str) {
if (str == NULL) {
return NULL;
}
size_t escaped_length = strlen(str) + 1;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped_length++;
}
}
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) {
return NULL;
}
size_t pos = 0;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped[pos++] = '\\';
escaped[pos++] = '"';
} else {
escaped[pos++] = str[i];
}
}
// no need to set zero due to calloc() being used prior
return escaped;
}
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) { bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
std::ofstream fout(fname); std::ofstream fout(fname);
int indent = 0; int indent = 0;
@ -451,7 +450,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
auto value_s = [&](const char *name, const char *val, bool end = false) { auto value_s = [&](const char *name, const char *val, bool end = false) {
start_value(name); start_value(name);
char * val_escaped = escape_double_quotes(val); char * val_escaped = escape_double_quotes_and_backslashes(val);
fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n"); fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
free(val_escaped); free(val_escaped);
}; };
@ -497,7 +496,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
value_i("layer", whisper_model_n_text_layer(ctx), true); value_i("layer", whisper_model_n_text_layer(ctx), true);
end_obj(); end_obj();
value_i("mels", whisper_model_n_mels(ctx)); value_i("mels", whisper_model_n_mels(ctx));
value_i("f16", whisper_model_f16(ctx), true); value_i("ftype", whisper_model_ftype(ctx), true);
end_obj(); end_obj();
start_obj("params"); start_obj("params");
value_s("model", params.model.c_str()); value_s("model", params.model.c_str());

View File

@ -0,0 +1,6 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})

View File

@ -0,0 +1,3 @@
# quantize
Tool for integer quantization of Whisper `ggml` model files

View File

@ -0,0 +1,215 @@
#include "ggml.h"
#include "common.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <regex>
// default hparams (Whisper tiny)
struct whisper_hparams {
int32_t n_vocab = 51864;
int32_t n_audio_ctx = 1500;
int32_t n_audio_state = 384;
int32_t n_audio_head = 6;
int32_t n_audio_layer = 4;
int32_t n_text_ctx = 448;
int32_t n_text_state = 384;
int32_t n_text_head = 6;
int32_t n_text_layer = 4;
int32_t n_mels = 80;
int32_t f16 = 1;
};
struct whisper_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
// quantize a model
bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
auto finp = std::ifstream(fname_inp, std::ios::binary);
if (!finp) {
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
return false;
}
auto fout = std::ofstream(fname_out, std::ios::binary);
if (!fout) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
return false;
}
// verify magic
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}
fout.write((char *) &magic, sizeof(magic));
}
whisper_hparams hparams;
// load hparams
{
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
finp.read((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
finp.read((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
finp.read((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
finp.read((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
fout.write((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
fout.write((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
fout.write((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
fout.write((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
fout.write((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
fout.write((char *) &ftype, sizeof(hparams.f16));
}
// load mel filters
{
whisper_filters filters;
finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));
filters.data.resize(filters.n_mel * filters.n_fft);
finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
}
// load vocab
{
int32_t n_vocab = 0;
finp.read ((char *) &n_vocab, sizeof(n_vocab));
fout.write((char *) &n_vocab, sizeof(n_vocab));
//if (n_vocab != hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
// return false;
//}
std::string word;
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
finp.read ((char *) &len, sizeof(len));
fout.write((char *) &len, sizeof(len));
word.resize(len);
finp.read ((char *) word.data(), len);
fout.write((char *) word.data(), len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// regexes of tensor names to not be quantized
const std::vector<std::string> to_skip = {
//"encoder.*",
"encoder.conv1.bias",
"encoder.conv2.bias",
"encoder.positional_embedding",
"decoder.positional_embedding",
};
if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
finp.close();
fout.close();
return true;
}
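The `to_skip` list above holds regular expressions for tensor names that should stay unquantized; the actual matching happens inside `ggml_common_quantize_0()`, which is not shown in this diff. A rough sketch of how such name filtering could work (an assumption, not the library's implementation):

```cpp
// Sketch (assumption): applying a "quantize these" / "skip these" pair of regex lists to a tensor name.
#include <regex>
#include <string>
#include <vector>

static bool tensor_matches(const std::vector<std::string> & patterns, const std::string & name) {
    for (const auto & pat : patterns) {
        if (std::regex_match(name, std::regex(pat))) {
            return true;
        }
    }
    return false;
}

// quantize a tensor only if it matches the to-quantize patterns and none of the skip patterns
static bool should_quantize(const std::string & name,
                            const std::vector<std::string> & to_quant,
                            const std::vector<std::string> & to_skip) {
    return tensor_matches(to_quant, name) && !tensor_matches(to_skip, name);
}
```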
int main(int argc, char ** argv) {
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr);
return 1;
}
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}
return 0;
}

View File

@ -35,6 +35,15 @@
<br><br> <br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr> <hr>
Select the model you would like to use, click the "Start" button and start speaking Select the model you would like to use, click the "Start" button and start speaking
@ -45,6 +54,10 @@
Whisper model: <span id="model-whisper-status"></span> Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span> <span id="fetch-whisper-progress"></span>
<!-- <!--
@ -162,11 +175,17 @@
let urls = { let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
}; };
let sizes = { let sizes = {
'tiny.en': 75, 'tiny.en': 75,
'base.en': 142, 'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
}; };
let url = urls[model]; let url = urls[model];
@ -177,6 +196,10 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... '; document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) { cbProgress = function(p) {
@ -188,6 +211,10 @@
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };

View File

@ -1,4 +1,4 @@
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# stream # stream
set(TARGET stream) set(TARGET stream)
add_executable(${TARGET} stream.cpp) add_executable(${TARGET} stream.cpp)

View File

@ -383,6 +383,7 @@ int main(int argc, char ** argv) {
} }
} }
} }
fflush(stdout);
} }
} }

View File

@ -1,4 +1,4 @@
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# talk-llama # talk-llama
set(TARGET talk-llama) set(TARGET talk-llama)
#add_executable(${TARGET} talk-llama.cpp llama.cpp) #add_executable(${TARGET} talk-llama.cpp llama.cpp)

View File

@ -21,12 +21,17 @@
#if defined(_POSIX_MAPPED_FILES) #if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h> #include <sys/mman.h>
#endif #endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif #endif
#endif #endif
#if defined(_WIN32) #if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX #define NOMINMAX
#endif
#include <windows.h> #include <windows.h>
#include <io.h> #include <io.h>
#include <stdio.h> // for _fseeki64 #include <stdio.h> // for _fseeki64
@ -41,8 +46,12 @@
} while (0) } while (0)
#ifdef __GNUC__ #ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2))) __attribute__((format(printf, 1, 2)))
#endif #endif
#endif
static std::string format(const char * fmt, ...) { static std::string format(const char * fmt, ...) {
va_list ap, ap2; va_list ap, ap2;
va_start(ap, fmt); va_start(ap, fmt);
@ -55,7 +64,7 @@ static std::string format(const char * fmt, ...) {
va_end(ap2); va_end(ap2);
va_end(ap); va_end(ap);
return std::string(buf.data(), size); return std::string(buf.data(), size);
}; }
struct llama_file { struct llama_file {
// use FILE * so we don't have to re-open the file to mmap // use FILE * so we don't have to re-open the file to mmap
@ -162,7 +171,7 @@ struct llama_mmap {
#ifdef _POSIX_MAPPED_FILES #ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true; static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file) { llama_mmap(struct llama_file * file, bool prefetch = true) {
size = file->size; size = file->size;
int fd = fileno(file->fp); int fd = fileno(file->fp);
int flags = MAP_SHARED; int flags = MAP_SHARED;
@ -170,17 +179,18 @@ struct llama_mmap {
flags |= MAP_POPULATE; flags |= MAP_POPULATE;
#endif #endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
close(fd);
if (addr == MAP_FAILED) { if (addr == MAP_FAILED) {
throw format("mmap failed: %s", strerror(errno)); throw format("mmap failed: %s", strerror(errno));
} }
if (prefetch) {
// Advise the kernel to preload the mapped memory // Advise the kernel to preload the mapped memory
if (madvise(addr, file->size, MADV_WILLNEED)) { if (madvise(addr, file->size, MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno)); strerror(errno));
} }
} }
}
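A short sketch of the new `prefetch` flag added to `llama_mmap` (this assumes the `llama_file(path, mode)` constructor defined earlier in llama_util.h; the file name is illustrative):

```cpp
// Sketch: effect of the prefetch flag on llama_mmap.
static void mmap_example() {
    llama_file file("ggml-model.bin", "rb");

    llama_mmap eager(&file);                       // default: madvise/PrefetchVirtualMemory preloads the pages
    llama_mmap lazy (&file, /*prefetch =*/ false); // map only - pages fault in on demand
}
```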
~llama_mmap() { ~llama_mmap() {
munmap(addr, size); munmap(addr, size);
@ -188,14 +198,13 @@ struct llama_mmap {
#elif defined(_WIN32) #elif defined(_WIN32)
static constexpr bool SUPPORTED = true; static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file) { llama_mmap(struct llama_file * file, bool prefetch = true) {
size = file->size; size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError(); DWORD error = GetLastError();
CloseHandle(hFile);
if (hMapping == NULL) { if (hMapping == NULL) {
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()); throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@ -209,6 +218,8 @@ struct llama_mmap {
throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
} }
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
if (prefetch) {
// Advise the kernel to preload the mapped memory // Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range; WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr; range.VirtualAddress = addr;
@ -218,6 +229,10 @@ struct llama_mmap {
llama_format_win_err(GetLastError()).c_str()); llama_format_win_err(GetLastError()).c_str());
} }
} }
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
~llama_mmap() { ~llama_mmap() {
if (!UnmapViewOfFile(addr)) { if (!UnmapViewOfFile(addr)) {
@ -291,8 +306,18 @@ struct llama_mlock {
if (!mlock(addr, size)) { if (!mlock(addr, size)) {
return true; return true;
} else { } else {
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION, char* errmsg = std::strerror(errno);
size, this->size, std::strerror(errno)); bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
suggest = false;
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
suggest = false;
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false; return false;
} }
} }
@ -338,8 +363,8 @@ struct llama_mlock {
// Hopefully a megabyte is enough overhead: // Hopefully a megabyte is enough overhead:
size_t increment = size + 1048576; size_t increment = size + 1048576;
// The minimum must be <= the maximum, so we need to increase both: // The minimum must be <= the maximum, so we need to increase both:
min_ws_size += size; min_ws_size += increment;
max_ws_size += size; max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str()); llama_format_win_err(GetLastError()).c_str());
@ -380,4 +405,29 @@ struct llama_buffer {
delete[] addr; delete[] addr;
} }
}; };
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
size_t size = 0;
void resize(size_t size) {
if (addr) {
ggml_cuda_host_free(addr);
}
addr = (uint8_t *) ggml_cuda_host_malloc(size);
this->size = size;
}
~llama_ctx_buffer() {
if (addr) {
ggml_cuda_host_free(addr);
}
}
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif
#endif #endif

File diff suppressed because it is too large

View File

@ -39,12 +39,16 @@ extern "C" {
typedef struct llama_token_data { typedef struct llama_token_data {
llama_token id; // token id llama_token id; // token id
float logit; // log-odds of the token
float p; // probability of the token float p; // probability of the token
float plog; // log probability of the token
} llama_token_data; } llama_token_data;
typedef struct llama_token_data_array {
llama_token_data * data;
size_t size;
bool sorted;
} llama_token_data_array;
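A `llama_token_data_array` is typically filled from the logits of the last evaluated token; the talk-llama changes further down in this diff follow the same pattern. A minimal sketch (the helper name `make_candidates` is not part of the API):

```cpp
#include <vector>
#include "llama.h"

static llama_token_data_array make_candidates(struct llama_context * ctx,
                                              std::vector<llama_token_data> & storage) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx);

    // storage is passed in by reference because the returned array points into it
    storage.clear();
    storage.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        // logit is filled in here; p is computed later by the sampling functions
        storage.push_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }

    return llama_token_data_array{ storage.data(), storage.size(), /*sorted =*/ false };
}
```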
typedef void (*llama_progress_callback)(float progress, void *ctx); typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params { struct llama_context_params {
@ -65,6 +69,20 @@ extern "C" {
void * progress_callback_user_data; void * progress_callback_user_data;
}; };
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API bool llama_mmap_supported(); LLAMA_API bool llama_mmap_supported();
@ -82,27 +100,46 @@ extern "C" {
// TODO: not great API - very likely to change // TODO: not great API - very likely to change
// Returns 0 on success // Returns 0 on success
// nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
LLAMA_API int llama_model_quantize( LLAMA_API int llama_model_quantize(
const char * fname_inp, const char * fname_inp,
const char * fname_out, const char * fname_out,
int itype); enum llama_ftype ftype,
int nthread);
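A sketch of calling the updated `llama_model_quantize()` with the new `ftype`/`nthread` parameters (file names and the wrapper name are illustrative):

```cpp
#include "llama.h"

static int quantize_to_q5_1(const char * fname_inp, const char * fname_out) {
    // nthread <= 0 lets the implementation pick std::thread::hardware_concurrency()
    return llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q5_1, /*nthread =*/ 0);
}
```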
// Returns the KV cache that will contain the context for the // Apply a LoRA adapter to a loaded model
// ongoing prediction with the model. // path_base_model is the path to a higher quality model to use as a base for
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx); // the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// Returns the size of the KV cache // will be applied on top of the previous one
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx); // Returns 0 on success
LLAMA_API int llama_apply_lora_from_file(
struct llama_context * ctx,
const char * path_lora,
const char * path_base_model,
int n_threads);
// Returns the number of tokens in the KV cache // Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx); LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
// Sets the KV cache containing the current context for the model // Sets the current rng seed.
LLAMA_API void llama_set_kv_cache( LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
struct llama_context * ctx,
const uint8_t * kv_cache, // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
size_t n_size, LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
int n_token_count);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
// Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
// Set the state reading from the specified address
// Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
// Save/load session file
LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
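A sketch of saving and restoring the full context state (rng, logits, embedding, kv_cache) with the state API declared above; error handling is omitted and the helper names are illustrative:

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

static std::vector<uint8_t> snapshot_state(struct llama_context * ctx) {
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    const size_t n = llama_copy_state_data(ctx, state.data());
    state.resize(n); // number of bytes actually written
    return state;
}

static void restore_state(struct llama_context * ctx, const std::vector<uint8_t> & state) {
    llama_set_state_data(ctx, state.data());
}
```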
// Run the llama inference to obtain the logits and probabilities for the next token. // Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process // tokens + n_tokens is the provided batch of new tokens to process
@ -148,16 +185,52 @@ extern "C" {
// Special tokens // Special tokens
LLAMA_API llama_token llama_token_bos(); LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos(); LLAMA_API llama_token llama_token_eos();
LLAMA_API llama_token llama_token_nl();
// TODO: improve the last_n_tokens interface ? // Sampling functions
LLAMA_API llama_token llama_sample_top_p_top_k(
struct llama_context * ctx, /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
const llama_token * last_n_tokens_data, LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
int last_n_tokens_size,
int top_k, /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
float top_p, LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
float temp,
float repeat_penalty); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
/// @details Selects the token with the highest probability.
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
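The functions above compose into a sampling pipeline; the talk-llama example later in this diff applies a repetition penalty followed by top-k, top-p and temperature. Two sketches of picking a token from a prepared `llama_token_data_array` (the parameter values are illustrative defaults, not mandated by the API):

```cpp
#include "llama.h"

static llama_token sample_top_k_top_p(struct llama_context * ctx, llama_token_data_array * cands) {
    llama_sample_top_k(ctx, cands, 40);
    llama_sample_top_p(ctx, cands, 0.95f);
    llama_sample_temperature(ctx, cands, 0.80f);
    return llama_sample_token(ctx, cands);
}

static llama_token sample_mirostat_v2(struct llama_context * ctx, llama_token_data_array * cands) {
    const float  tau = 5.0f;       // target surprise
    const float  eta = 0.10f;      // learning rate
    static float mu  = 2.0f * tau; // per the docs above, mu starts at twice the target cross-entropy
                                   // and is updated across successive sampling calls
    return llama_sample_token_mirostat_v2(ctx, cands, tau, eta, &mu);
}
```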
// Performance information // Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx);
@ -170,4 +243,15 @@ extern "C" {
} }
#endif #endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include <vector>
#include <string>
struct ggml_tensor;
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif
#endif // LLAMA_H #endif // LLAMA_H

View File

@ -1,12 +0,0 @@
// Internal header to be included by llama.cpp and tests/benchmarks only.
#ifndef LLAMA_INTERNAL_H
#define LLAMA_INTERNAL_H
#include <vector>
#include <string>
struct ggml_tensor;
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif // LLAMA_INTERNAL_H

View File

@ -487,11 +487,37 @@ int main(int argc, char ** argv) {
{ {
auto logits = llama_get_logits(ctx_llama); auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(ctx_llama);
logits[llama_token_eos()] = 0; logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama, std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// apply repeat penalty
const float nl_logit = logits[llama_token_nl()];
llama_sample_repetition_penalty(ctx_llama, &candidates_p,
embd_inp.data() + std::max(0, n_past - repeat_last_n), embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty); repeat_last_n, repeat_penalty);
logits[llama_token_nl()] = nl_logit;
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k);
llama_sample_top_p(ctx_llama, &candidates_p, top_p);
llama_sample_temperature(ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
} }
if (id != llama_token_eos()) { if (id != llama_token_eos()) {

View File

@ -13,6 +13,7 @@ include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE target_link_libraries(${TARGET} PRIVATE
whisper whisper
common
) )
unset(EXTRA_FLAGS) unset(EXTRA_FLAGS)

View File

@ -1,4 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "common-ggml.h"
#include "gpt-2.h" #include "gpt-2.h"
#include <cmath> #include <cmath>
@ -14,150 +16,6 @@
/////////////////////// GPT-2 BEGIN ///////////////////////// /////////////////////// GPT-2 BEGIN /////////////////////////
//
// Vocab utils
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.size() == 0) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
for (int i = 0; i < n_logits; i++) {
logits_id.push_back(std::make_pair(logits[i], i));
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
// normalize
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
if (top_p < 1.0f) {
{
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += logits_id[i].first;
if (cumsum >= top_p) {
logits_id.resize(i+1);
break;
}
}
}
// normalize again
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
}
//printf("\n");
//for (int i = 0; i < (int)logits_id.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
//}
//exit(0);
// sample from the obtained distribution
std::vector<double> probs;
probs.reserve(logits_id.size());
for (int i = 0; i < (int) logits_id.size(); i++) {
probs.push_back(logits_id[i].first);
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
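With the sampler moved into the shared common code, example programs call it through common.h. A sketch of such a call on the logits returned by `gpt2_eval()` (the wrapper name `pick_next_token` is illustrative):

```cpp
#include <random>
#include <vector>
#include "common.h"

static gpt_vocab::id pick_next_token(const gpt_vocab & vocab,
                                     const std::vector<float> & logits,
                                     std::mt19937 & rng) {
    const int    top_k = 40;
    const double top_p = 0.9;
    const double temp  = 1.0; // note: the header above warns that temperature is not implemented yet

    return gpt_sample_top_k_top_p(vocab, logits.data(), top_k, top_p, temp, rng);
}
```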
// default hparams (GPT-2 117M) // default hparams (GPT-2 117M)
struct gpt2_hparams { struct gpt2_hparams {
int32_t n_vocab = 50257; int32_t n_vocab = 50257;
@ -165,7 +23,7 @@ struct gpt2_hparams {
int32_t n_embd = 768; int32_t n_embd = 768;
int32_t n_head = 12; int32_t n_head = 12;
int32_t n_layer = 12; int32_t n_layer = 12;
int32_t f16 = 1; int32_t ftype = 1;
}; };
struct gpt2_layer { struct gpt2_layer {
@ -187,7 +45,7 @@ struct gpt2_layer {
struct ggml_tensor * c_mlp_fc_w; struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b; struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b; struct ggml_tensor * c_mlp_proj_b;
}; };
@ -200,6 +58,7 @@ struct gpt2_model {
struct ggml_tensor * wte; // position embedding struct ggml_tensor * wte; // position embedding
struct ggml_tensor * wpe; // token embedding struct ggml_tensor * wpe; // token embedding
struct ggml_tensor * lm_head; // language model head
std::vector<gpt2_layer> layers; std::vector<gpt2_layer> layers;
@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.f16, sizeof(hparams.f16)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16); printf("%s: ftype = %d\n", __func__, hparams.ftype);
} }
// load vocab // load vocab
@ -275,9 +134,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
} }
} }
// for the big tensors, we have the option to store the data in 16-bit floats // for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation // in order to save memory and also to speed up the computation
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
auto & ctx = model.ctx; auto & ctx = model.ctx;
@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
const int n_ctx = hparams.n_ctx; const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += (6 + 12*n_layer)*256; // object overhead ctx_size += (6 + 12*n_layer)*256; // object overhead
@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context // create the ggml context
{ {
struct ggml_init_params params; struct ggml_init_params params = {
params.mem_size = ctx_size; .mem_size = ctx_size,
params.mem_buffer = NULL; .mem_buffer = NULL,
.no_alloc = false,
};
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
if (!model.ctx) { if (!model.ctx) {
@ -352,6 +219,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
// map by name // map by name
model.tensors["model/ln_f/g"] = model.ln_f_g; model.tensors["model/ln_f/g"] = model.ln_f_g;
@ -359,6 +227,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/wte"] = model.wte; model.tensors["model/wte"] = model.wte;
model.tensors["model/wpe"] = model.wpe; model.tensors["model/wpe"] = model.wpe;
model.tensors["model/lm_head"] = model.lm_head;
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -369,16 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd); layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name // map by name
@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans; model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
} }
} }
@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
{ {
size_t total_size = 0; size_t total_size = 0;
bool has_lm_head = false;
while (true) { while (true) {
int32_t n_dims; int32_t n_dims;
int32_t length; int32_t length;
int32_t ftype; int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length)); fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) { if (fin.eof()) {
break; break;
@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return false; return false;
} }
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t); // for debugging
if (0) {
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
if (nelements*bpe != ggml_nbytes(tensor)) { const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe); __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false; return false;
@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor)); fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); // GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
}
if (name == "model/lm_head") {
has_lm_head = true;
}
total_size += ggml_nbytes(tensor); total_size += ggml_nbytes(tensor);
} }
@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - n_threads: number of threads to use // - n_threads: number of threads to use
// - n_past: the context size so far // - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context // - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted probabilities of the next token // - embd_w: the predicted logits for the next token
// //
bool gpt2_eval( bool gpt2_eval(
const gpt2_model & model, const gpt2_model & model,
@ -512,12 +396,12 @@ bool gpt2_eval(
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
static size_t buf_size = 640u*1024*1024; static size_t buf_size = 512u*1024*1024;
static void * buf = malloc(buf_size); static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N > buf_size) { if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate // reallocate
buf_size = buf_size_new; buf_size = buf_size_new;
@ -528,12 +412,13 @@ bool gpt2_eval(
} }
} }
struct ggml_init_params params; struct ggml_init_params params = {
params.mem_size = buf_size; /*.mem_size =*/ buf_size,
params.mem_buffer = buf; /*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {}; struct ggml_cgraph gf = {};
gf.n_threads = n_threads; gf.n_threads = n_threads;
@ -578,7 +463,7 @@ bool gpt2_eval(
// [2304, N] // [2304, N]
{ {
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w), model.layers[il].c_attn_attn_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -654,11 +539,13 @@ bool gpt2_eval(
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12] // [n_past + N, 64, 12]
struct ggml_tensor * V_trans = struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0, ggml_permute(ctx0,
ggml_reshape_3d(ctx0, ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N), n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3); 1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max // KQV = transpose(V) * KQ_soft_max
// [64, N, 12] // [64, N, 12]
@ -685,7 +572,7 @@ bool gpt2_eval(
// [768, N] // [768, N]
{ {
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w), model.layers[il].c_attn_proj_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -722,7 +609,7 @@ bool gpt2_eval(
// cur = fc_w*cur + fc_b // cur = fc_w*cur + fc_b
// [3072, N] // [3072, N]
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w), model.layers[il].c_mlp_fc_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -742,7 +629,7 @@ bool gpt2_eval(
// cur = proj_w*cur + proj_b // cur = proj_w*cur + proj_b
// [768, N] // [768, N]
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w_trans, model.layers[il].c_mlp_proj_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -769,12 +656,12 @@ bool gpt2_eval(
} }
// inpL = WTE * inpL // inpL = WTE * inpL
// [ 768, 50257] - model.wte // [ 768, 50257] - model.lm_head
// [ 768, N] - inpL // [ 768, N] - inpL
inpL = ggml_mul_mat(ctx0, model.wte, inpL); inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
// logits -> probs // logits -> probs
inpL = ggml_soft_max(ctx0, inpL); //inpL = ggml_soft_max(ctx0, inpL);
// run the computation // run the computation
ggml_build_forward_expand(&gf, inpL); ggml_build_forward_expand(&gf, inpL);
@ -788,7 +675,7 @@ bool gpt2_eval(
//embd_w.resize(n_vocab*N); //embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result for just the last token // return result just for the last token
embd_w.resize(n_vocab); embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
@ -825,7 +712,7 @@ Me too.
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
// sampling parameters // sampling parameters
int32_t top_k = 40; int32_t top_k = 5;
float top_p = 0.9f; float top_p = 0.9f;
float temp = 1.0f; float temp = 1.0f;
}; };
@ -833,14 +720,14 @@ Me too.
struct gpt2_context * gpt2_init(const char * path_model) { struct gpt2_context * gpt2_init(const char * path_model) {
gpt2_context * ctx = new gpt2_context; gpt2_context * ctx = new gpt2_context;
ctx->rng = std::mt19937(time(NULL)); ctx->rng = std::mt19937(time(nullptr));
// load the model // load the model
{ {
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_time_us();
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) { if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin"); fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
delete ctx; delete ctx;
return nullptr; return nullptr;
} }
@ -885,9 +772,9 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
std::string result; std::string result;
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) { for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
// predict // predict
if (embd.size() > 0) { if (!embd.empty()) {
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) { if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
printf("gpt-2: failed to generate text\n"); printf("gpt-2: failed to generate text\n");
return ""; return "";
@ -914,10 +801,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
result += ctx->vocab.id_to_token[embd[0]]; result += ctx->vocab.id_to_token[embd[0]];
// end of text token // end of text token
if (embd.back() == 50256 || if (embd.back() == 50256) {
ctx->vocab.id_to_token[embd.back()] == "." ||
ctx->vocab.id_to_token[embd.back()] == "!" ||
ctx->vocab.id_to_token[embd.back()] == "?") {
break; break;
} }
} }
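A sketch of driving the GPT-2 helper used by the talk example; `gpt2_init()` and `gpt2_gen_text()` appear in this diff, while `gpt2_free()` is assumed to exist in gpt-2.h for cleanup:

```cpp
#include <cstdio>
#include <string>
#include "gpt-2.h"

static int demo_gpt2(const char * model_path) {
    struct gpt2_context * g = gpt2_init(model_path);
    if (g == nullptr) {
        return 1;
    }

    const std::string reply = gpt2_gen_text(g, "Hello, how are you?", 64); // up to 64 new tokens
    printf("%s\n", reply.c_str());

    gpt2_free(g); // assumed cleanup call
    return 0;
}
```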

View File

@ -2,18 +2,12 @@
// TODO: Change to C-style API and move to ./examples for easy reuse. // TODO: Change to C-style API and move to ./examples for easy reuse.
#include "common.h"
#include <vector> #include <vector>
#include <map> #include <map>
#include <string> #include <string>
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};
struct gpt2_context; struct gpt2_context;
struct gpt2_context * gpt2_init(const char * path_model); struct gpt2_context * gpt2_init(const char * path_model);

View File

@ -44,6 +44,15 @@
<br><br> <br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr> <hr>
Select the models you would like to use and click the "Start" button to begin the conversation Select the models you would like to use and click the "Start" button to begin the conversation
@ -54,6 +63,10 @@
Whisper model: <span id="model-whisper-status"></span> Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span> <span id="fetch-whisper-progress"></span>
<!-- <!--
@ -266,11 +279,17 @@
let urls = { let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
}; };
let sizes = { let sizes = {
'tiny.en': 75, 'tiny.en': 75,
'base.en': 142, 'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
}; };
let url = urls[model]; let url = urls[model];
@ -281,6 +300,10 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... '; document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) { cbProgress = function(p) {
@ -292,6 +315,10 @@
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };


@ -1,16 +1,8 @@
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# talk # talk
set(TARGET talk) set(TARGET talk)
#add_executable(${TARGET} talk.cpp gpt-2.cpp) add_executable(${TARGET} talk.cpp gpt-2.cpp)
#target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS}) target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
#target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
# TODO: this is temporary
# need to export ggml symbols for MSVC, but too lazy ..
add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endif () endif ()


@ -1,4 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "common-ggml.h"
#include "gpt-2.h" #include "gpt-2.h"
#include <cmath> #include <cmath>
@ -14,150 +16,6 @@
/////////////////////// GPT-2 BEGIN ///////////////////////// /////////////////////// GPT-2 BEGIN /////////////////////////
//
// Vocab utils
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.empty()) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double /*temp*/,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
for (int i = 0; i < n_logits; i++) {
logits_id.emplace_back(logits[i], i);
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
// normalize
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
if (top_p < 1.0f) {
{
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += logits_id[i].first;
if (cumsum >= top_p) {
logits_id.resize(i+1);
break;
}
}
}
// normalize again
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
}
//printf("\n");
//for (int i = 0; i < (int) logits_id.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
//}
//exit(0);
// sample from the obtained distribution
std::vector<double> probs;
probs.reserve(logits_id.size());
for (int i = 0; i < (int) logits_id.size(); i++) {
probs.push_back(logits_id[i].first);
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
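The tokenizer and the top-k/top-p sampler above are deleted from this file because the shared common / common-ggml helpers pulled in by the new includes appear to provide them now. For orientation, a minimal sketch of how such a sampler is typically driven from the logits that gpt2_eval writes into embd_w; the wrapper and its default parameters are illustrative, not taken from this diff:

// illustrative sketch: sample the next token from the n_vocab logits produced by gpt2_eval
// (vocab, embd_w and rng correspond to ctx->vocab, the logits vector and ctx->rng in this file)
gpt_vocab::id sample_next(const gpt_vocab & vocab, const std::vector<float> & embd_w, std::mt19937 & rng) {
    const int    top_k = 40;   // assumed typical sampling parameters
    const double top_p = 0.9;
    const double temp  = 1.0;  // note: this particular sampler ignores temperature
    return gpt_sample_top_k_top_p(vocab, embd_w.data(), top_k, top_p, temp, rng);
}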
// default hparams (GPT-2 117M) // default hparams (GPT-2 117M)
struct gpt2_hparams { struct gpt2_hparams {
int32_t n_vocab = 50257; int32_t n_vocab = 50257;
@ -165,7 +23,7 @@ struct gpt2_hparams {
int32_t n_embd = 768; int32_t n_embd = 768;
int32_t n_head = 12; int32_t n_head = 12;
int32_t n_layer = 12; int32_t n_layer = 12;
int32_t f16 = 1; int32_t ftype = 1;
}; };
struct gpt2_layer { struct gpt2_layer {
@ -187,7 +45,7 @@ struct gpt2_layer {
struct ggml_tensor * c_mlp_fc_w; struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b; struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b; struct ggml_tensor * c_mlp_proj_b;
}; };
@ -200,6 +58,7 @@ struct gpt2_model {
struct ggml_tensor * wte; // token embedding
struct ggml_tensor * wpe; // position embedding
struct ggml_tensor * lm_head; // language model head
std::vector<gpt2_layer> layers; std::vector<gpt2_layer> layers;
@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.f16, sizeof(hparams.f16)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16); printf("%s: ftype = %d\n", __func__, hparams.ftype);
} }
// load vocab // load vocab
@ -268,16 +127,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read((char *) &len, sizeof(len)); fin.read((char *) &len, sizeof(len));
word.resize(len); word.resize(len);
fin.read((char *) &word[0], len); fin.read((char *) word.data(), len);
vocab.token_to_id[word] = i; vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word; vocab.id_to_token[i] = word;
} }
} }
// for the big tensors, we have the option to store the data in 16-bit floats // for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation // in order to save memory and also to speed up the computation
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
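The model header now carries a general ftype instead of a plain f16 flag, so the loader translates the file type into a per-tensor weight type and rejects values it does not recognize. The real conversion is provided by the updated ggml/common helpers; as an illustrative sketch only, such a mapping over the ggml_ftype / ggml_type enums (shown in the ggml.h diff further below) looks roughly like this:

// illustrative sketch of an ftype -> ggml_type mapping; the shipped helper is
// ggml_ftype_to_ggml_type, and GGML_TYPE_COUNT is used to flag "invalid"
static ggml_type wtype_from_ftype(enum ggml_ftype ftype) {
    switch (ftype) {
        case GGML_FTYPE_ALL_F32:     return GGML_TYPE_F32;
        case GGML_FTYPE_MOSTLY_F16:  return GGML_TYPE_F16;
        case GGML_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
        case GGML_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
        case GGML_FTYPE_MOSTLY_Q4_2: return GGML_TYPE_Q4_2;
        case GGML_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
        case GGML_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
        case GGML_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0;
        default:                     return GGML_TYPE_COUNT; // unknown / unsupported
    }
}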
auto & ctx = model.ctx; auto & ctx = model.ctx;
@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
const int n_ctx = hparams.n_ctx; const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += (6 + 12*n_layer)*256; // object overhead ctx_size += (6 + 12*n_layer)*256; // object overhead
@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context // create the ggml context
{ {
struct ggml_init_params params; struct ggml_init_params params = {
params.mem_size = ctx_size; .mem_size = ctx_size,
params.mem_buffer = nullptr; .mem_buffer = NULL,
.no_alloc = false,
};
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
if (!model.ctx) { if (!model.ctx) {
@ -352,6 +219,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
// map by name // map by name
model.tensors["model/ln_f/g"] = model.ln_f_g; model.tensors["model/ln_f/g"] = model.ln_f_g;
@ -359,6 +227,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/wte"] = model.wte; model.tensors["model/wte"] = model.wte;
model.tensors["model/wpe"] = model.wpe; model.tensors["model/wpe"] = model.wpe;
model.tensors["model/lm_head"] = model.lm_head;
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -369,16 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd); layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name // map by name
@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans; model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
} }
} }
@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
{ {
size_t total_size = 0; size_t total_size = 0;
bool has_lm_head = false;
while (true) { while (true) {
int32_t n_dims; int32_t n_dims;
int32_t length; int32_t length;
int32_t ftype; int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length)); fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) { if (fin.eof()) {
break; break;
@ -448,7 +319,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
std::string name(length, 0); std::string name(length, 0);
fin.read(&name[0], length); fin.read(&name[0], length);
if (model.tensors.find(name) == model.tensors.end()) { if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false; return false;
} }
@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return false; return false;
} }
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t); // for debugging
if (0) {
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
if (nelements*bpe != ggml_nbytes(tensor)) { const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe); __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false; return false;
@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor)); fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); // GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
}
if (name == "model/lm_head") {
has_lm_head = true;
}
total_size += ggml_nbytes(tensor); total_size += ggml_nbytes(tensor);
} }
@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - n_threads: number of threads to use // - n_threads: number of threads to use
// - n_past: the context size so far // - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context // - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted probabilities of the next token // - embd_w: the predicted logits for the next token
// //
bool gpt2_eval( bool gpt2_eval(
const gpt2_model & model, const gpt2_model & model,
@ -512,12 +396,12 @@ bool gpt2_eval(
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
static size_t buf_size = 5640ull*1024*1024; static size_t buf_size = 512u*1024*1024;
static void * buf = malloc(buf_size); static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N > buf_size) { if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate // reallocate
buf_size = buf_size_new; buf_size = buf_size_new;
@ -528,12 +412,13 @@ bool gpt2_eval(
} }
} }
struct ggml_init_params params; struct ggml_init_params params = {
params.mem_size = buf_size; /*.mem_size =*/ buf_size,
params.mem_buffer = buf; /*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {}; struct ggml_cgraph gf = {};
gf.n_threads = n_threads; gf.n_threads = n_threads;
@ -578,7 +463,7 @@ bool gpt2_eval(
// [2304, N] // [2304, N]
{ {
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w), model.layers[il].c_attn_attn_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -654,11 +539,13 @@ bool gpt2_eval(
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12] // [n_past + N, 64, 12]
struct ggml_tensor * V_trans = struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0, ggml_permute(ctx0,
ggml_reshape_3d(ctx0, ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N), n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3); 1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max // KQV = transpose(V) * KQ_soft_max
// [64, N, 12] // [64, N, 12]
@ -685,7 +572,7 @@ bool gpt2_eval(
// [768, N] // [768, N]
{ {
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w), model.layers[il].c_attn_proj_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -722,7 +609,7 @@ bool gpt2_eval(
// cur = fc_w*cur + fc_b // cur = fc_w*cur + fc_b
// [3072, N] // [3072, N]
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w), model.layers[il].c_mlp_fc_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -742,7 +629,7 @@ bool gpt2_eval(
// cur = proj_w*cur + proj_b // cur = proj_w*cur + proj_b
// [768, N] // [768, N]
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w_trans, model.layers[il].c_mlp_proj_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -769,12 +656,12 @@ bool gpt2_eval(
} }
// inpL = WTE * inpL // inpL = WTE * inpL
// [ 768, 50257] - model.wte // [ 768, 50257] - model.lm_head
// [ 768, N] - inpL // [ 768, N] - inpL
inpL = ggml_mul_mat(ctx0, model.wte, inpL); inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
// logits -> probs // logits -> probs
inpL = ggml_soft_max(ctx0, inpL); //inpL = ggml_soft_max(ctx0, inpL);
// run the computation // run the computation
ggml_build_forward_expand(&gf, inpL); ggml_build_forward_expand(&gf, inpL);
@ -788,7 +675,7 @@ bool gpt2_eval(
//embd_w.resize(n_vocab*N); //embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result for just the last token // return result just for the last token
embd_w.resize(n_vocab); embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);


@ -2,18 +2,12 @@
// TODO: Change to C-style API and move to ./examples for easy reuse. // TODO: Change to C-style API and move to ./examples for easy reuse.
#include "common.h"
#include <vector> #include <vector>
#include <map> #include <map>
#include <string> #include <string>
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};
struct gpt2_context; struct gpt2_context;
struct gpt2_context * gpt2_init(const char * path_model); struct gpt2_context * gpt2_init(const char * path_model);


@ -9,4 +9,6 @@ To use:
5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
[^1]: I recommend the tiny or base models for running on an Android device.
(PS: Do not move this Android project folder on its own to another location, because it depends on files from the rest of the repository.)
<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">


@ -31,9 +31,9 @@ endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \ --bind \
-s USE_PTHREADS=1 \ -s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \ -s PTHREAD_POOL_SIZE_STRICT=0 \
-s INITIAL_MEMORY=1500MB \ -s INITIAL_MEMORY=2000MB \
-s TOTAL_MEMORY=1500MB \ -s TOTAL_MEMORY=2000MB \
-s FORCE_FILESYSTEM=1 \ -s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \ -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \ ${EXTRA_FLAGS} \


@ -10,6 +10,12 @@ std::thread g_worker;
std::vector<struct whisper_context *> g_contexts(4, nullptr); std::vector<struct whisper_context *> g_contexts(4, nullptr);
static inline int mpow2(int n) {
int p = 1;
while (p <= n) p *= 2;
return p/2;
}
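mpow2 rounds its argument down to the nearest power of two; together with the new nthreads argument it caps the worker pool at 16 threads or the nearest power of two below the reported hardware concurrency. A standalone check of the helper, for reference:

// standalone check: mpow2 rounds n down to the nearest power of two (mpow2(0) == 0)
#include <cassert>

static inline int mpow2(int n) { int p = 1; while (p <= n) p *= 2; return p/2; }

int main() {
    assert(mpow2(1) == 1 && mpow2(7) == 4 && mpow2(8) == 8 && mpow2(12) == 8);
    return 0;
}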
EMSCRIPTEN_BINDINGS(whisper) { EMSCRIPTEN_BINDINGS(whisper) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
if (g_worker.joinable()) { if (g_worker.joinable()) {
@ -43,7 +49,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
} }
})); }));
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) { emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) {
if (g_worker.joinable()) { if (g_worker.joinable()) {
g_worker.join(); g_worker.join();
} }
@ -66,7 +72,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
params.print_special = false; params.print_special = false;
params.translate = translate; params.translate = translate;
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en"; params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency()); params.n_threads = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
params.offset_ms = 0; params.offset_ms = 0;
std::vector<float> pcmf32; std::vector<float> pcmf32;


@ -40,21 +40,42 @@
Note that the computation is quite heavy and may take a few seconds to complete.<br> Note that the computation is quite heavy and may take a few seconds to complete.<br>
The transcription results will be displayed in the text area below.<br><br> The transcription results will be displayed in the text area below.<br><br>
<b>Important: your browser must support WASM SIMD instructions for this to work.</b> <b>Important:</b>
<ul>
<li>your browser must support WASM SIMD instructions for this to work</li>
<li>Firefox cannot load files larger than 256 MB - use Chrome instead</li>
</ul>
<br><br><hr> <b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<hr>
<div id="model"> <div id="model">
Whisper model: <span id="model-whisper-status"></span> Whisper models: <span id="model-whisper-status"></span><br><br>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button> <button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button> <button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button> <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
<button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button> <button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
<span id="fetch-whisper-progress"></span>
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" /> <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-tiny-q5_1" onclick="loadWhisper('tiny-q5_1')">tiny (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<button id="fetch-whisper-base-q5_1" onclick="loadWhisper('base-q5_1')">base (Q5_1, 57 MB)</button>
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
<button id="fetch-whisper-small-q5_1" onclick="loadWhisper('small-q5_1')">small (Q5_1, 182 MB)</button><br>
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
<button id="fetch-whisper-medium-q5_0" onclick="loadWhisper('medium-q5_0')">medium (Q5_0, 515 MB)</button>
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
<span id="fetch-whisper-progress"></span>
</div> </div>
<br> <br>
@ -161,6 +182,12 @@
<option value="yi">Yiddish</option> <option value="yi">Yiddish</option>
</select> </select>
</td> </td>
<!-- Slider to select number of threads between 1 and 16 -->
<td>
Threads:
<input type="range" id="threads" name="threads" min="1" max="16" value="8" onchange="changeThreads(this.value)" />
<span id="threads-value">8</span>
</td>
<td> <td>
<button onclick="onProcess(false);">Transcribe</button> <button onclick="onProcess(false);">Transcribe</button>
</td> </td>
@ -263,11 +290,13 @@
Module.FS_createDataFile("/", fname, buf, true, true); Module.FS_createDataFile("/", fname, buf, true, true);
model_whisper = fname; //model_whisper = fname;
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!'; document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length); printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper;
} }
function loadFile(event, fname) { function loadFile(event, fname) {
@ -292,6 +321,17 @@
document.getElementById('fetch-whisper-tiny' ).style.display = 'none'; document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none'; document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('fetch-whisper-small' ).style.display = 'none'; document.getElementById('fetch-whisper-small' ).style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none'; document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name; document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
} }
@ -304,6 +344,16 @@
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin', 'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin', 'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin', 'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'tiny-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
'base-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base-q5_1.bin',
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
'small-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small-q5_1.bin',
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
'medium-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-medium-q5_0.bin',
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
}; };
let sizes = { let sizes = {
@ -313,6 +363,16 @@
'base': 142, 'base': 142,
'small.en': 466, 'small.en': 466,
'small': 466, 'small': 466,
'tiny-en-q5_1': 31,
'tiny-q5_1': 31,
'base-en-q5_1': 57,
'base-q5_1': 57,
'small-en-q5_1': 182,
'small-q5_1': 182,
'medium-en-q5_0': 515,
'medium-q5_0': 515,
'large-q5_0': 1030,
}; };
let url = urls[model]; let url = urls[model];
@ -327,6 +387,17 @@
document.getElementById('fetch-whisper-tiny' ).style.display = 'none'; document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none'; document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('fetch-whisper-small' ).style.display = 'none'; document.getElementById('fetch-whisper-small' ).style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none'; document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model; document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model;
@ -337,12 +408,24 @@
cbCancel = function() { cbCancel = function() {
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-medium-q5_0' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };
@ -354,7 +437,8 @@
// audio file // audio file
// //
const kMaxAudio_s = 120; const kMaxAudio_s = 30*60;
const kMaxRecording_s = 2*60;
const kSampleRate = 16000; const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext; window.AudioContext = window.AudioContext || window.webkitAudioContext;
@ -423,7 +507,7 @@
doRecording = false; doRecording = false;
} }
// record up to kMaxAudio_s seconds of audio from the microphone // record up to kMaxRecording_s seconds of audio from the microphone
// check if doRecording is false every 1000 ms and stop recording if so // check if doRecording is false every 1000 ms and stop recording if so
// update progress information // update progress information
function startRecording() { function startRecording() {
@ -479,9 +563,9 @@
printTextarea('js: audio recorded, size: ' + audio.length); printTextarea('js: audio recorded, size: ' + audio.length);
// truncate to the first kMaxRecording_s seconds
if (audio.length > kMaxAudio_s*kSampleRate) { if (audio.length > kMaxRecording_s*kSampleRate) {
audio = audio.slice(0, kMaxAudio_s*kSampleRate); audio = audio.slice(0, kMaxRecording_s*kSampleRate);
printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds'); printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds');
} }
setAudio(audio); setAudio(audio);
}); });
@ -509,24 +593,31 @@
}); });
} }
document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%'; document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%';
document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%'; document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%';
}, 1000); }, 1000);
printTextarea('js: recording ...'); printTextarea('js: recording ...');
setTimeout(function() { setTimeout(function() {
if (doRecording) { if (doRecording) {
printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds'); printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds');
stopRecording(); stopRecording();
} }
}, kMaxAudio_s*1000); }, kMaxRecording_s*1000);
} }
// //
// transcribe // transcribe
// //
var nthreads = 8;
function changeThreads(value) {
nthreads = value;
document.getElementById('threads-value').innerHTML = nthreads;
}
function onProcess(translate) { function onProcess(translate) {
if (!instance) { if (!instance) {
instance = Module.init('whisper.bin'); instance = Module.init('whisper.bin');
@ -553,7 +644,7 @@
printTextarea(''); printTextarea('');
setTimeout(function() { setTimeout(function() {
var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate); var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate);
console.log('js: full_default returned: ' + ret); console.log('js: full_default returned: ' + ret);
if (ret) { if (ret) {
printTextarea("js: whisper returned: " + ret); printTextarea("js: whisper returned: " + ret);

extra/quantize-all.sh (new executable file, 45 lines)

@ -0,0 +1,45 @@
#!/bin/bash
printf "Usage: $0 <upload>"
if [ $# -ne 1 ]; then
printf "\nError: Invalid number of arguments\n"
exit 1
fi
qtype0="q5_0"
qtype1="q5_1"
upload="$1"
cd `dirname $0`
cd ../
./quantize ./models/ggml-tiny.en.bin ./models/ggml-tiny.en-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-tiny.bin ./models/ggml-tiny-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-base.en.bin ./models/ggml-base.en-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-base.bin ./models/ggml-base-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-small.en.bin ./models/ggml-small.en-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-small.bin ./models/ggml-small-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-medium.en.bin ./models/ggml-medium.en-${qtype0}.bin ${qtype0}
./quantize ./models/ggml-medium.bin ./models/ggml-medium-${qtype0}.bin ${qtype0}
./quantize ./models/ggml-large.bin ./models/ggml-large-${qtype0}.bin ${qtype0}
if [ "$upload" == "1" ]; then
scp ./models/ggml-tiny.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny.en-${qtype1}.bin
scp ./models/ggml-tiny-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny-${qtype1}.bin
scp ./models/ggml-base.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base.en-${qtype1}.bin
scp ./models/ggml-base-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base-${qtype1}.bin
scp ./models/ggml-small.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small.en-${qtype1}.bin
scp ./models/ggml-small-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small-${qtype1}.bin
scp ./models/ggml-medium.en-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium.en-${qtype0}.bin
scp ./models/ggml-medium-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium-${qtype0}.bin
scp ./models/ggml-large-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-large-${qtype0}.bin
fi

extra/sync-ggml.sh (new executable file, 10 lines)

@ -0,0 +1,10 @@
#!/bin/bash
cp -rpv ../ggml/src/ggml.c ./ggml.c
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
cp -rpv ../ggml/examples/common.h ./examples/common.h
cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp
cp -rpv ../ggml/examples/common-ggml.h ./examples/common-ggml.h
cp -rpv ../ggml/examples/common-ggml.cpp ./examples/common-ggml.cpp

ggml-cuda.cu (new file, 365 lines)

@ -0,0 +1,365 @@
#include <stdint.h>
#include <stdio.h>
#include <cuda_fp16.h>
#include <atomic>
#include "ggml-cuda.h"
typedef uint16_t ggml_fp16_t;
static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define QK4_0 32
typedef struct {
float d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
typedef struct {
float d; // delta
float m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK4_2 16
typedef struct {
__half d; // delta
uint8_t qs[QK4_2 / 2]; // nibbles / quants
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
#define QK5_0 32
typedef struct {
__half d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
#define QK5_1 32
typedef struct {
__half d; // delta
__half m; // min
uint32_t qh; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
#define QK8_0 32
typedef struct {
float d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
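As a quick aside (not part of this file), the block layouts above fix the storage cost per weight: each block packs 32 weights (16 for Q4_2) next to its scale and, where present, its minimum and its high-bit mask. A small worked check of the implied bits per weight:

// worked check of the bits-per-weight implied by the block definitions above
#include <cstdio>

int main() {
    // bytes per block * 8 / weights per block
    printf("q4_0: %.1f bits/weight\n", 8.0*(4         + 16) / 32); // 5.0
    printf("q4_1: %.1f bits/weight\n", 8.0*(4 + 4     + 16) / 32); // 6.0
    printf("q5_0: %.1f bits/weight\n", 8.0*(2     + 4 + 16) / 32); // 5.5
    printf("q5_1: %.1f bits/weight\n", 8.0*(2 + 2 + 4 + 16) / 32); // 6.0
    printf("q8_0: %.1f bits/weight\n", 8.0*(4         + 32) / 32); // 9.0
    return 0;
}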
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
const block_q4_0 * x = (const block_q4_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_0; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*QK4_0 + l + 0] = v0;
y[i*QK4_0 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
const block_q4_1 * x = (const block_q4_1 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const float m = x[i].m;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_1; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*QK4_1 + l + 0] = v0;
y[i*QK4_1 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
const block_q4_2 * x = (const block_q4_2 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_2; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*QK4_2 + l + 0] = v0;
y[i*QK4_2 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
const block_q5_0 * x = (const block_q5_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
for (int l = 0; l < QK5_0; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const int8_t vi0 = ((vi & 0xf) | vh0);
const int8_t vi1 = ((vi >> 4) | vh1);
const float v0 = (vi0 - 16)*d;
const float v1 = (vi1 - 16)*d;
y[i*QK5_0 + l + 0] = v0;
y[i*QK5_0 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
const block_q5_1 * x = (const block_q5_1 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const float m = x[i].m;
const uint8_t * pp = x[i].qs;
const uint32_t qh = x[i].qh;
for (int l = 0; l < QK5_1; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const int8_t vi0 = (vi & 0xf) | vh0;
const int8_t vi1 = (vi >> 4) | vh1;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*QK5_1 + l + 0] = v0;
y[i*QK5_1 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
const block_q8_0 * x = (const block_q8_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const int8_t * pp = x[i].qs;
for (int l = 0; l < QK8_0; l++) {
const int8_t vi = pp[l];
y[i*QK8_0 + l] = vi*d;
}
}
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_0;
dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_1;
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_2;
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_0;
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_1;
dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK8_0;
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
}
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
return dequantize_row_q4_0_cuda;
case GGML_TYPE_Q4_1:
return dequantize_row_q4_1_cuda;
case GGML_TYPE_Q4_2:
return dequantize_row_q4_2_cuda;
case GGML_TYPE_Q5_0:
return dequantize_row_q5_0_cuda;
case GGML_TYPE_Q5_1:
return dequantize_row_q5_1_cuda;
case GGML_TYPE_Q8_0:
return dequantize_row_q8_0_cuda;
default:
return nullptr;
}
}
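Each kernel above dequantizes one block per CUDA thread block back to fp32, and ggml_get_dequantize_row_q_cuda picks the right kernel launcher for a tensor type. A hedged host-side sketch of how a quantized row of k values becomes floats on the device; it assumes the declarations in ggml-cuda.h, and the real path runs inside the cuBLAS matrix-multiplication code rather than standalone:

// illustrative host-side use of the dispatch table above (device buffers assumed allocated)
void dequantize_device_row(const void * d_quantized, float * d_fp32, int k, ggml_type type) {
    dequantize_row_q_cuda_t to_fp32 = ggml_get_dequantize_row_q_cuda(type);
    if (to_fp32 == nullptr) {
        fprintf(stderr, "no CUDA dequantization for ggml type %d\n", (int) type);
        return;
    }
    to_fp32(d_quantized, d_fp32, k, g_cudaStream);    // launches k/block_size thread blocks, async
    CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));  // wait so the caller can use d_fp32
}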
// buffer pool for cuda
#define MAX_CUDA_BUFFERS 16
struct scoped_spin_lock {
std::atomic_flag& lock;
scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
while (lock.test_and_set(std::memory_order_acquire)) {
; // spin
}
}
~scoped_spin_lock() {
lock.clear(std::memory_order_release);
}
scoped_spin_lock(const scoped_spin_lock&) = delete;
scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
};
struct cuda_buffer {
void * ptr = nullptr;
size_t size = 0;
};
static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[i];
if (b.size >= size && b.ptr != nullptr) {
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
}
}
void * ptr;
CUDA_CHECK(cudaMalloc((void **) &ptr, size));
*actual_size = size;
return ptr;
}
void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;
return;
}
}
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
CUDA_CHECK(cudaFree(ptr));
}
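The pool recycles up to MAX_CUDA_BUFFERS device allocations: ggml_cuda_pool_malloc hands back the first cached buffer that is large enough (falling through to cudaMalloc otherwise), and ggml_cuda_pool_free returns the pointer to the cache instead of freeing it. A short illustrative round trip:

// illustrative round trip through the pool above: the second request can reuse
// the buffer returned by the first instead of hitting cudaMalloc again
void pool_roundtrip_example() {
    size_t actual = 0;
    void * d_scratch = ggml_cuda_pool_malloc(1024*1024, &actual); // at least 1 MB; may be a larger recycled buffer
    // ... launch kernels that read from / write to d_scratch ...
    ggml_cuda_pool_free(d_scratch, actual); // return it (with its true size) for later reuse
}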
cublasHandle_t g_cublasH = nullptr;
cudaStream_t g_cudaStream = nullptr;
cudaStream_t g_cudaStream2 = nullptr;
cudaEvent_t g_cudaEvent = nullptr;
void ggml_init_cublas() {
if (g_cublasH == nullptr) {
// create cublas handle, bind a stream
CUBLAS_CHECK(cublasCreate(&g_cublasH));
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
// create additional stream and event for synchronization
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
}
}
cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
const uint64_t ne0 = src->ne[0];
const uint64_t ne1 = src->ne[1];
const uint64_t nb0 = src->nb[0];
const uint64_t nb1 = src->nb[1];
const uint64_t nb2 = src->nb[2];
const uint64_t nb3 = src->nb[3];
const enum ggml_type type = src->type;
const size_t ts = ggml_type_size(type);
const size_t bs = ggml_blck_size(type);
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
if (nb0 == ts && nb1 == ts*ne0/bs) {
return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
} else if (nb0 == ts) {
return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
} else {
for (uint64_t i1 = 0; i1 < ne1; i1++) {
const void * rx = (const void *) ((const char *) x + i1*nb1);
void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
// pretend the row is a matrix with cols=1
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
if (r != cudaSuccess) return r;
}
return cudaSuccess;
}
}
void * ggml_cuda_host_malloc(size_t size) {
void * ptr;
CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
return ptr;
}
void ggml_cuda_host_free(void * ptr) {
CUDA_CHECK(cudaFreeHost(ptr));
}

ggml-cuda.h (new file, 54 lines)

@ -0,0 +1,54 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

#define CUDA_CHECK(err)                                                               \
    do {                                                                              \
        cudaError_t err_ = (err);                                                     \
        if (err_ != cudaSuccess) {                                                    \
            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                cudaGetErrorString(err_));                                            \
            exit(1);                                                                  \
        }                                                                             \
    } while (0)

#define CUBLAS_CHECK(err)                                                             \
    do {                                                                              \
        cublasStatus_t err_ = (err);                                                  \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                                          \
            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
            exit(1);                                                                  \
        }                                                                             \
    } while (0)

extern cublasHandle_t g_cublasH;
extern cudaStream_t   g_cudaStream;
extern cudaStream_t   g_cudaStream2;
extern cudaEvent_t    g_cudaEvent;

void   ggml_init_cublas(void);
void * ggml_cuda_host_malloc(size_t size);
void   ggml_cuda_host_free(void * ptr);

void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
void   ggml_cuda_pool_free(void * ptr, size_t size);

void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);

cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);

typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);

#ifdef __cplusplus
}
#endif
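
Again purely illustrative (not part of the diff): the error-checking macros and the per-type dequantization lookup declared above, used from a standalone program. Whether ggml_get_dequantize_row_q_cuda() returns NULL for types without a CUDA kernel is an assumption here, not something the header states, so the check below is defensive.

#include <cstdio>
#include <cstdlib>
#include "ggml.h"
#include "ggml-cuda.h"

int main() {
    // CUDA_CHECK / CUBLAS_CHECK report the failing location and exit(1)
    cublasHandle_t handle = NULL;
    CUBLAS_CHECK(cublasCreate(&handle));

    const enum ggml_type types[] = {
        GGML_TYPE_F32,  GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_2,
        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0,
    };
    for (size_t i = 0; i < sizeof(types)/sizeof(types[0]); ++i) {
        dequantize_row_q_cuda_t fn = ggml_get_dequantize_row_q_cuda(types[i]);
        printf("type %2d: CUDA dequantize %s\n", (int) types[i], fn ? "available" : "not available");
    }

    CUBLAS_CHECK(cublasDestroy(handle));
    return 0;
}
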

ggml.c (4026 changed lines; diff suppressed because it is too large)

ggml.h (287 changed lines)

@ -169,14 +169,27 @@
// //
// //
#ifdef __cplusplus #ifdef GGML_SHARED
extern "C" { # if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BUILD
# define GGML_API __declspec(dllexport)
# else
# define GGML_API __declspec(dllimport)
# endif
# else
# define GGML_API __attribute__ ((visibility ("default")))
# endif
#else
# define GGML_API
#endif #endif
#include <stdint.h> #include <stdint.h>
#include <stddef.h> #include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
#define GGML_MAX_DIMS 4 #define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096 #define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16 #define GGML_MAX_PARAMS 16
@ -184,6 +197,10 @@ extern "C" {
#define GGML_MAX_OPT 4 #define GGML_MAX_OPT 4
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __ARM_NEON #ifdef __ARM_NEON
// we use the built-in 16-bit float type // we use the built-in 16-bit float type
typedef __fp16 ggml_fp16_t; typedef __fp16 ggml_fp16_t;
@ -192,24 +209,43 @@ typedef uint16_t ggml_fp16_t;
#endif #endif
// convert FP16 <-> FP32 // convert FP16 <-> FP32
float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
ggml_fp16_t ggml_fp32_to_fp16(float x); GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
struct ggml_object; struct ggml_object;
struct ggml_context; struct ggml_context;
enum ggml_type { enum ggml_type {
// explicitly numbered values are used in llama.cpp files
GGML_TYPE_F32 = 0, GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1, GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3, GGML_TYPE_Q4_1 = 3,
GGML_TYPE_Q4_2 = 4,
// GGML_TYPE_Q4_3 (5) support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
GGML_TYPE_I8, GGML_TYPE_I8,
GGML_TYPE_I16, GGML_TYPE_I16,
GGML_TYPE_I32, GGML_TYPE_I32,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
// model file types
enum ggml_ftype {
GGML_FTYPE_UNKNOWN = -1,
GGML_FTYPE_ALL_F32 = 0,
GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
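
As a side note (not part of the diff), the new ggml_ftype values map onto the tensor types above via ggml_ftype_to_ggml_type(), declared a few lines further down together with ggml_type_name(). A minimal sketch of that mapping, mirroring what whisper_model_load() does later in this changeset:

#include <cstdio>
#include "ggml.h"

int main() {
    const enum ggml_ftype ftypes[] = {
        GGML_FTYPE_ALL_F32,     GGML_FTYPE_MOSTLY_F16,  GGML_FTYPE_MOSTLY_Q4_0,
        GGML_FTYPE_MOSTLY_Q4_1, GGML_FTYPE_MOSTLY_Q4_2, GGML_FTYPE_MOSTLY_Q5_0,
        GGML_FTYPE_MOSTLY_Q5_1, GGML_FTYPE_MOSTLY_Q8_0,
    };
    for (size_t i = 0; i < sizeof(ftypes)/sizeof(ftypes[0]); ++i) {
        const enum ggml_type wtype = ggml_ftype_to_ggml_type(ftypes[i]);
        printf("ftype %d -> weight type %s\n", (int) ftypes[i], ggml_type_name(wtype));
    }
    return 0;
}
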
// available tensor operations: // available tensor operations:
enum ggml_op { enum ggml_op {
GGML_OP_NONE = 0, GGML_OP_NONE = 0,
@ -247,6 +283,7 @@ enum ggml_op {
GGML_OP_DIAG_MASK_INF, GGML_OP_DIAG_MASK_INF,
GGML_OP_SOFT_MAX, GGML_OP_SOFT_MAX,
GGML_OP_ROPE, GGML_OP_ROPE,
GGML_OP_ALIBI,
GGML_OP_CONV_1D_1S, GGML_OP_CONV_1D_1S,
GGML_OP_CONV_1D_2S, GGML_OP_CONV_1D_2S,
@ -338,56 +375,66 @@ struct ggml_init_params {
bool no_alloc; // don't allocate memory for the tensor data bool no_alloc; // don't allocate memory for the tensor data
}; };
void ggml_time_init(void); // call this once at the beginning of the program // misc
int64_t ggml_time_ms(void);
int64_t ggml_time_us(void);
int64_t ggml_cycles(void);
int64_t ggml_cycles_per_ms(void);
void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_time_init(void); // call this once at the beginning of the program
void ggml_print_objects(const struct ggml_context * ctx); GGML_API int64_t ggml_time_ms(void);
GGML_API int64_t ggml_time_us(void);
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);
int64_t ggml_nelements(const struct ggml_tensor * tensor); GGML_API void ggml_print_object (const struct ggml_object * obj);
size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API void ggml_print_objects(const struct ggml_context * ctx);
int ggml_blck_size (enum ggml_type type); GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
size_t ggml_element_size(const struct ggml_tensor * tensor); GGML_API int ggml_blck_size (enum ggml_type type);
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
struct ggml_context * ggml_init(struct ggml_init_params params); GGML_API const char * ggml_type_name(enum ggml_type type);
void ggml_free(struct ggml_context * ctx);
size_t ggml_used_mem(const struct ggml_context * ctx); GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API bool ggml_is_quantized(enum ggml_type type);
struct ggml_tensor * ggml_new_tensor( GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int n_dims, int n_dims,
const int64_t *ne); const int64_t *ne);
struct ggml_tensor * ggml_new_tensor_1d( GGML_API struct ggml_tensor * ggml_new_tensor_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int64_t ne0); int64_t ne0);
struct ggml_tensor * ggml_new_tensor_2d( GGML_API struct ggml_tensor * ggml_new_tensor_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int64_t ne0, int64_t ne0,
int64_t ne1); int64_t ne1);
struct ggml_tensor * ggml_new_tensor_3d( GGML_API struct ggml_tensor * ggml_new_tensor_3d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int64_t ne0, int64_t ne0,
int64_t ne1, int64_t ne1,
int64_t ne2); int64_t ne2);
struct ggml_tensor * ggml_new_tensor_4d( GGML_API struct ggml_tensor * ggml_new_tensor_4d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int64_t ne0, int64_t ne0,
@ -395,122 +442,127 @@ struct ggml_tensor * ggml_new_tensor_4d(
int64_t ne2, int64_t ne2,
int64_t ne3); int64_t ne3);
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
float * ggml_get_data_f32(const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
// //
// operations on tensors with backpropagation // operations on tensors with backpropagation
// //
struct ggml_tensor * ggml_dup( GGML_API struct ggml_tensor * ggml_dup(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_add( GGML_API struct ggml_tensor * ggml_add(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_sub( GGML_API struct ggml_tensor * ggml_add_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_mul( GGML_API struct ggml_tensor * ggml_sub(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_div( GGML_API struct ggml_tensor * ggml_mul(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_sqr( GGML_API struct ggml_tensor * ggml_div(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_sqr(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_sqrt( GGML_API struct ggml_tensor * ggml_sqrt(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// return scalar // return scalar
// TODO: compute sum along rows // TODO: compute sum along rows
struct ggml_tensor * ggml_sum( GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// mean along rows // mean along rows
struct ggml_tensor * ggml_mean( GGML_API struct ggml_tensor * ggml_mean(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// if a is the same shape as b, and a is not parameter, return a // if a is the same shape as b, and a is not parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b // otherwise, return a new tensor: repeat(a) to fit in b
struct ggml_tensor * ggml_repeat( GGML_API struct ggml_tensor * ggml_repeat(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_abs( GGML_API struct ggml_tensor * ggml_abs(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_sgn( GGML_API struct ggml_tensor * ggml_sgn(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_neg( GGML_API struct ggml_tensor * ggml_neg(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_step( GGML_API struct ggml_tensor * ggml_step(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_relu( GGML_API struct ggml_tensor * ggml_relu(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// TODO: double-check this computation is correct // TODO: double-check this computation is correct
struct ggml_tensor * ggml_gelu( GGML_API struct ggml_tensor * ggml_gelu(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_silu( GGML_API struct ggml_tensor * ggml_silu(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// normalize along rows // normalize along rows
// TODO: eps is hardcoded to 1e-5 for now // TODO: eps is hardcoded to 1e-5 for now
struct ggml_tensor * ggml_norm( GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_rms_norm( GGML_API struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// A: m rows, n columns // A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally) // B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows // result is m columns, p rows
struct ggml_tensor * ggml_mul_mat( GGML_API struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
@ -520,32 +572,32 @@ struct ggml_tensor * ggml_mul_mat(
// //
// in-place, returns view(a) // in-place, returns view(a)
struct ggml_tensor * ggml_scale( GGML_API struct ggml_tensor * ggml_scale(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// a -> b, return view(b) // a -> b, return view(b)
struct ggml_tensor * ggml_cpy( GGML_API struct ggml_tensor * ggml_cpy(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// make contiguous // make contiguous
struct ggml_tensor * ggml_cont( GGML_API struct ggml_tensor * ggml_cont(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// return view(a), b specifies the new shape // return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape( GGML_API struct ggml_tensor * ggml_reshape(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// return view(a) // return view(a)
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_2d( GGML_API struct ggml_tensor * ggml_reshape_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
@ -553,7 +605,7 @@ struct ggml_tensor * ggml_reshape_2d(
// return view(a) // return view(a)
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_3d( GGML_API struct ggml_tensor * ggml_reshape_3d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
@ -561,13 +613,13 @@ struct ggml_tensor * ggml_reshape_3d(
int64_t ne2); int64_t ne2);
// offset in bytes // offset in bytes
struct ggml_tensor * ggml_view_1d( GGML_API struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
size_t offset); size_t offset);
struct ggml_tensor * ggml_view_2d( GGML_API struct ggml_tensor * ggml_view_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
@ -575,7 +627,7 @@ struct ggml_tensor * ggml_view_2d(
size_t nb1, // row stride in bytes size_t nb1, // row stride in bytes
size_t offset); size_t offset);
struct ggml_tensor * ggml_view_3d( GGML_API struct ggml_tensor * ggml_view_3d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
@ -585,7 +637,7 @@ struct ggml_tensor * ggml_view_3d(
size_t nb2, // slice stride in bytes size_t nb2, // slice stride in bytes
size_t offset); size_t offset);
struct ggml_tensor * ggml_permute( GGML_API struct ggml_tensor * ggml_permute(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int axis0, int axis0,
@ -594,60 +646,69 @@ struct ggml_tensor * ggml_permute(
int axis3); int axis3);
// alias for ggml_permute(ctx, a, 1, 0, 2, 3) // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
struct ggml_tensor * ggml_transpose( GGML_API struct ggml_tensor * ggml_transpose(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_get_rows( GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// set elements above the diagonal to -INF // set elements above the diagonal to -INF
// in-place, returns view(a) // in-place, returns view(a)
struct ggml_tensor * ggml_diag_mask_inf( GGML_API struct ggml_tensor * ggml_diag_mask_inf(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past); int n_past);
// in-place, returns view(a) // in-place, returns view(a)
struct ggml_tensor * ggml_soft_max( GGML_API struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// rotary position embedding // rotary position embedding
// in-place, returns view(a) // in-place, returns view(a)
// if mode == 1, skip n_past elements // if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
// TODO: avoid creating a new tensor every time // TODO: avoid creating a new tensor every time
struct ggml_tensor * ggml_rope( GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, int n_past,
int n_dims, int n_dims,
int mode); int mode);
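
Purely for illustration (not part of the diff): an end-to-end sketch of ggml_rope() with the mode bit-flags documented above. Tensor shapes are made up, and the default thread count set by ggml_build_forward() is assumed.

#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // (head_dim, n_tokens, n_heads) activations, zero-filled for the demo
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 4);
    ggml_set_f32(a, 0.0f);

    const int n_past = 0;
    const int mode   = 0; // bit 0: skip n_past elements, bit 1: GPT-NeoX style

    struct ggml_tensor * r  = ggml_rope(ctx, a, n_past, /*n_dims=*/64, mode);
    struct ggml_cgraph   gf = ggml_build_forward(r);
    ggml_graph_compute(ctx, &gf);

    printf("r[0] = %f\n", ggml_get_f32_1d(r, 0));

    ggml_free(ctx);
    return 0;
}
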
// alibi position embedding
// in-place, returns view(a)
struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head);
// padding = 1 // padding = 1
// TODO: we don't support extra parameters for now // TODO: we don't support extra parameters for now
// that's why we are hard-coding the stride, padding, and dilation // that's why we are hard-coding the stride, padding, and dilation
// not great .. // not great ..
struct ggml_tensor * ggml_conv_1d_1s( GGML_API struct ggml_tensor * ggml_conv_1d_1s(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_conv_1d_2s( GGML_API struct ggml_tensor * ggml_conv_1d_2s(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_flash_attn( GGML_API struct ggml_tensor * ggml_flash_attn(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * q, struct ggml_tensor * q,
struct ggml_tensor * k, struct ggml_tensor * k,
struct ggml_tensor * v, struct ggml_tensor * v,
bool masked); bool masked);
struct ggml_tensor * ggml_flash_ff( GGML_API struct ggml_tensor * ggml_flash_ff(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b0, struct ggml_tensor * b0,
@ -659,12 +720,12 @@ struct ggml_tensor * ggml_flash_ff(
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
struct ggml_tensor * ggml_map_unary_f32( GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
const ggml_unary_op_f32_t fun); const ggml_unary_op_f32_t fun);
struct ggml_tensor * ggml_map_binary_f32( GGML_API struct ggml_tensor * ggml_map_binary_f32(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
@ -674,23 +735,23 @@ struct ggml_tensor * ggml_map_binary_f32(
// automatic differentiation // automatic differentiation
// //
void ggml_set_param( GGML_API void ggml_set_param(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * tensor); struct ggml_tensor * tensor);
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
void ggml_graph_reset (struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
// print info and performance information for the graph // print info and performance information for the graph
void ggml_graph_print(const struct ggml_cgraph * cgraph); GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
// dump the graph into a file using the dot format // dump the graph into a file using the dot format
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
// //
// optimization // optimization
@ -783,10 +844,10 @@ struct ggml_opt_params {
} lbfgs; } lbfgs;
}; };
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
// optimize the function defined by the tensor f // optimize the function defined by the tensor f
enum ggml_opt_result ggml_opt( GGML_API enum ggml_opt_result ggml_opt(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_opt_params params, struct ggml_opt_params params,
struct ggml_tensor * f); struct ggml_tensor * f);
@ -795,26 +856,36 @@ enum ggml_opt_result ggml_opt(
// quantization // quantization
// //
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
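
For reference (not part of the diff), a minimal sketch of the new chunked quantization entry point. It assumes that n must be a multiple of the block size and that hist has 16 buckets, as with the existing per-type quantizers.

#include <cstdio>
#include <vector>
#include "ggml.h"

int main() {
    const int n = 256; // multiple of the Q4_0 block size (32)
    std::vector<float> src(n);
    for (int i = 0; i < n; ++i) {
        src[i] = 0.01f*(i - n/2);
    }

    std::vector<uint8_t> dst(ggml_type_size(GGML_TYPE_Q4_0)*(n/ggml_blck_size(GGML_TYPE_Q4_0)));
    int64_t hist[16] = {0};

    const size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_0, src.data(), dst.data(), /*start=*/0, n, hist);
    printf("quantized %d floats into %zu bytes\n", n, written);
    return 0;
}
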
// //
// system info // system info
// //
int ggml_cpu_has_avx(void); GGML_API int ggml_cpu_has_avx (void);
int ggml_cpu_has_avx2(void); GGML_API int ggml_cpu_has_avx2 (void);
int ggml_cpu_has_avx512(void); GGML_API int ggml_cpu_has_avx512 (void);
int ggml_cpu_has_fma(void); GGML_API int ggml_cpu_has_avx512_vbmi(void);
int ggml_cpu_has_neon(void); GGML_API int ggml_cpu_has_avx512_vnni(void);
int ggml_cpu_has_arm_fma(void); GGML_API int ggml_cpu_has_fma (void);
int ggml_cpu_has_f16c(void); GGML_API int ggml_cpu_has_neon (void);
int ggml_cpu_has_fp16_va(void); GGML_API int ggml_cpu_has_arm_fma (void);
int ggml_cpu_has_wasm_simd(void); GGML_API int ggml_cpu_has_f16c (void);
int ggml_cpu_has_blas(void); GGML_API int ggml_cpu_has_fp16_va (void);
int ggml_cpu_has_sse3(void); GGML_API int ggml_cpu_has_wasm_simd (void);
int ggml_cpu_has_vsx(void); GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_vsx (void);
// //
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks
@ -834,7 +905,9 @@ typedef struct {
dequantize_row_q_t dequantize_row_q; dequantize_row_q_t dequantize_row_q;
quantize_row_q_t quantize_row_q; quantize_row_q_t quantize_row_q;
quantize_row_q_t quantize_row_q_reference; quantize_row_q_t quantize_row_q_reference;
quantize_row_q_t quantize_row_q_dot;
vec_dot_q_t vec_dot_q; vec_dot_q_t vec_dot_q;
enum ggml_type vec_dot_type;
} quantize_fns_t; } quantize_fns_t;
quantize_fns_t ggml_internal_get_quantize_fn(size_t i); quantize_fns_t ggml_internal_get_quantize_fn(size_t i);

models/convert-h5-to-ggml.py

@ -23,6 +23,7 @@ import json
import code import code
import torch import torch
import numpy as np import numpy as np
from pathlib import Path
from transformers import WhisperForConditionalGeneration from transformers import WhisperForConditionalGeneration
@ -75,16 +76,13 @@ if len(sys.argv) < 4:
print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n") print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n")
sys.exit(1) sys.exit(1)
dir_model = sys.argv[1] dir_model = Path(sys.argv[1])
dir_whisper = sys.argv[2] dir_whisper = Path(sys.argv[2])
dir_out = sys.argv[3] dir_out = Path(sys.argv[3])
with open(dir_model + "/vocab.json", "r", encoding="utf8") as f: encoder = json.load((dir_model / "vocab.json").open("r", encoding="utf8"))
encoder = json.load(f) encoder_added = json.load((dir_model / "added_tokens.json").open( "r", encoding="utf8"))
with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f: hparams = json.load((dir_model / "config.json").open("r", encoding="utf8") )
encoder_added = json.load(f)
with open(dir_model + "/config.json", "r", encoding="utf8") as f:
hparams = json.load(f)
model = WhisperForConditionalGeneration.from_pretrained(dir_model) model = WhisperForConditionalGeneration.from_pretrained(dir_model)
@ -96,16 +94,15 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
dir_tokenizer = dir_model dir_tokenizer = dir_model
fname_out = dir_out + "/ggml-model.bin" fname_out = dir_out / "ggml-model.bin"
with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f: tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
tokens = json.load(f)
# use 16-bit or 32-bit floats # use 16-bit or 32-bit floats
use_f16 = True use_f16 = True
if len(sys.argv) > 4: if len(sys.argv) > 4:
use_f16 = False use_f16 = False
fname_out = dir_out + "/ggml-model-f32.bin" fname_out = dir_out / "ggml-model-f32.bin"
fout = open(fname_out, "wb") fout = open(fname_out, "wb")
@ -171,10 +168,9 @@ for name in list_vars.keys():
data = data.astype(np.float16) data = data.astype(np.float16)
# reshape conv bias from [n] to [n, 1] # reshape conv bias from [n] to [n, 1]
if name == "encoder.conv1.bias" or \ if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
name == "encoder.conv2.bias":
data = data.reshape(data.shape[0], 1) data = data.reshape(data.shape[0], 1)
print(" Reshaped variable: " + name + " to shape: ", data.shape) print(" Reshaped variable: " , name , " to shape: ", data.shape)
n_dims = len(data.shape) n_dims = len(data.shape)
print(name, n_dims, data.shape) print(name, n_dims, data.shape)
@ -182,7 +178,7 @@ for name in list_vars.keys():
# looks like the whisper models are in f16 by default # looks like the whisper models are in f16 by default
# so we need to convert the small tensors to f32 until we fully support f16 in ggml # so we need to convert the small tensors to f32 until we fully support f16 in ggml
# ftype == 0 -> float32, ftype == 1 -> float16 # ftype == 0 -> float32, ftype == 1 -> float16
ftype = 1; ftype = 1
if use_f16: if use_f16:
if n_dims < 2 or \ if n_dims < 2 or \
name == "encoder.conv1.bias" or \ name == "encoder.conv1.bias" or \
@ -197,16 +193,16 @@ for name in list_vars.keys():
ftype = 0 ftype = 0
# header # header
str = name.encode('utf-8') str_ = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype)) fout.write(struct.pack("iii", n_dims, len(str_), ftype))
for i in range(n_dims): for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str); fout.write(str_)
# data # data
data.tofile(fout) data.tofile(fout)
fout.close() fout.close()
print("Done. Output file: " + fname_out) print("Done. Output file: " , fname_out)
print("") print("")

models/convert-pt-to-ggml.py

@ -40,7 +40,7 @@ import code
import torch import torch
import numpy as np import numpy as np
import base64 import base64
from pathlib import Path
#from transformers import GPTJForCausalLM #from transformers import GPTJForCausalLM
#from transformers import GPT2TokenizerFast #from transformers import GPT2TokenizerFast
@ -194,17 +194,17 @@ if len(sys.argv) < 4:
print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n") print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
sys.exit(1) sys.exit(1)
fname_inp = sys.argv[1] fname_inp = Path(sys.argv[1])
dir_whisper = sys.argv[2] dir_whisper = Path(sys.argv[2])
dir_out = sys.argv[3] dir_out = Path(sys.argv[3])
# try to load PyTorch binary data # try to load PyTorch binary data
try: try:
model_bytes = open(fname_inp, "rb").read() model_bytes = open(fname_inp, "rb").read()
with io.BytesIO(model_bytes) as fp: with io.BytesIO(model_bytes) as fp:
checkpoint = torch.load(fp, map_location="cpu") checkpoint = torch.load(fp, map_location="cpu")
except: except Exception:
print("Error: failed to load PyTorch model file: %s" % fname_inp) print("Error: failed to load PyTorch model file:" , fname_inp)
sys.exit(1) sys.exit(1)
hparams = checkpoint["dims"] hparams = checkpoint["dims"]
@ -218,17 +218,17 @@ list_vars = checkpoint["model_state_dict"]
# load mel filters # load mel filters
n_mels = hparams["n_mels"] n_mels = hparams["n_mels"]
with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f: with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
filters = torch.from_numpy(f[f"mel_{n_mels}"]) filters = torch.from_numpy(f[f"mel_{n_mels}"])
#print (filters) #print (filters)
#code.interact(local=locals()) #code.interact(local=locals())
multilingual = hparams["n_vocab"] == 51865 multilingual = hparams["n_vocab"] == 51865
tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken") tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
# output in the same directory as the model # output in the same directory as the model
fname_out = dir_out + "/ggml-model.bin" fname_out = dir_out / "ggml-model.bin"
with open(tokenizer, "rb") as f: with open(tokenizer, "rb") as f:
contents = f.read() contents = f.read()
@ -238,9 +238,9 @@ with open(tokenizer, "rb") as f:
use_f16 = True use_f16 = True
if len(sys.argv) > 4: if len(sys.argv) > 4:
use_f16 = False use_f16 = False
fname_out = dir_out + "/ggml-model-f32.bin" fname_out = dir_out / "ggml-model-f32.bin"
fout = open(fname_out, "wb") fout = fname_out.open("wb")
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["n_vocab"])) fout.write(struct.pack("i", hparams["n_vocab"]))
@ -273,20 +273,19 @@ for key in tokens:
for name in list_vars.keys(): for name in list_vars.keys():
data = list_vars[name].squeeze().numpy() data = list_vars[name].squeeze().numpy()
print("Processing variable: " + name + " with shape: ", data.shape) print("Processing variable: " , name , " with shape: ", data.shape)
# reshape conv bias from [n] to [n, 1] # reshape conv bias from [n] to [n, 1]
if name == "encoder.conv1.bias" or \ if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
name == "encoder.conv2.bias":
data = data.reshape(data.shape[0], 1) data = data.reshape(data.shape[0], 1)
print(" Reshaped variable: " + name + " to shape: ", data.shape) print(f" Reshaped variable: {name} to shape: ", data.shape)
n_dims = len(data.shape); n_dims = len(data.shape)
# looks like the whisper models are in f16 by default # looks like the whisper models are in f16 by default
# so we need to convert the small tensors to f32 until we fully support f16 in ggml # so we need to convert the small tensors to f32 until we fully support f16 in ggml
# ftype == 0 -> float32, ftype == 1 -> float16 # ftype == 0 -> float32, ftype == 1 -> float16
ftype = 1; ftype = 1
if use_f16: if use_f16:
if n_dims < 2 or \ if n_dims < 2 or \
name == "encoder.conv1.bias" or \ name == "encoder.conv1.bias" or \
@ -307,16 +306,16 @@ for name in list_vars.keys():
# data = data.transpose() # data = data.transpose()
# header # header
str = name.encode('utf-8') str_ = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype)) fout.write(struct.pack("iii", n_dims, len(str_), ftype))
for i in range(n_dims): for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str); fout.write(str_)
# data # data
data.tofile(fout) data.tofile(fout)
fout.close() fout.close()
print("Done. Output file: " + fname_out) print("Done. Output file: " , fname_out)
print("") print("")

models/convert-whisper-to-coreml.py

@ -20,7 +20,7 @@ def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
""" """
for k in state_dict: for k in state_dict:
is_attention = all(substr in k for substr in ['attn', '.weight']) is_attention = all(substr in k for substr in ['attn', '.weight'])
is_mlp = any([k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight']]) is_mlp = any(k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight'])
if (is_attention or is_mlp) and len(state_dict[k].shape) == 2: if (is_attention or is_mlp) and len(state_dict[k].shape) == 2:
state_dict[k] = state_dict[k][:, :, None, None] state_dict[k] = state_dict[k][:, :, None, None]
@ -42,11 +42,10 @@ class LayerNormANE(LayerNormANEBase):
class MultiHeadAttentionANE(MultiHeadAttention): class MultiHeadAttentionANE(MultiHeadAttention):
def __init__(self, n_state: int, n_head: int): def __init__(self, n_state: int, n_head: int):
super().__init__(n_state, n_head) super().__init__(n_state, n_head)
self.query = nn.Conv2d(n_state, n_state, kernel_size=1)
setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1)) self.key = nn.Conv2d(n_state, n_state, kernel_size=1, bias=False)
setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False)) self.value = nn.Conv2d(n_state, n_state, kernel_size=1)
setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1)) self.out = nn.Conv2d(n_state, n_state, kernel_size=1)
setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))
def forward(self, def forward(self,
x: Tensor, x: Tensor,
@ -104,30 +103,28 @@ class MultiHeadAttentionANE(MultiHeadAttention):
class ResidualAttentionBlockANE(ResidualAttentionBlock): class ResidualAttentionBlockANE(ResidualAttentionBlock):
def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
super().__init__(n_state, n_head, cross_attention) super().__init__(n_state, n_head, cross_attention)
self.attn = MultiHeadAttentionANE(n_state, n_head)
setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head)) self.attn_ln = LayerNormANE(n_state)
setattr(self, 'attn_ln', LayerNormANE(n_state)) self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
self.cross_attn_ln = LayerNormANE(n_state) if cross_attention else None
setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None)
setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None)
n_mlp = n_state * 4 n_mlp = n_state * 4
setattr(self, 'mlp', nn.Sequential( self.mlp = nn.Sequential(
nn.Conv2d(n_state, n_mlp, kernel_size=1), nn.Conv2d(n_state, n_mlp, kernel_size=1),
nn.GELU(), nn.GELU(),
nn.Conv2d(n_mlp, n_state, kernel_size=1) nn.Conv2d(n_mlp, n_state, kernel_size=1)
)) )
setattr(self, 'mlp_ln', LayerNormANE(n_state)) self.mlp_ln = LayerNormANE(n_state)
class AudioEncoderANE(AudioEncoder): class AudioEncoderANE(AudioEncoder):
def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
super().__init__(n_mels, n_ctx, n_state, n_head, n_layer) super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)
setattr(self, 'blocks', nn.ModuleList( self.blocks = nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)] [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
)) )
setattr(self, 'ln_post', LayerNormANE(n_state)) self.ln_post = LayerNormANE(n_state)
def forward(self, x: Tensor): def forward(self, x: Tensor):
""" """
@ -168,10 +165,10 @@ class TextDecoderANE(TextDecoder):
def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer) super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)
setattr(self, 'blocks', nn.ModuleList( self.blocks= nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)] [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
)) )
setattr(self, 'ln', LayerNormANE(n_state)) self.ln= LayerNormANE(n_state)
def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
""" """
@ -213,20 +210,20 @@ class WhisperANE(Whisper):
def __init__(self, dims: ModelDimensions): def __init__(self, dims: ModelDimensions):
super().__init__(dims) super().__init__(dims)
setattr(self, 'encoder', AudioEncoderANE( self.encoder = AudioEncoderANE(
self.dims.n_mels, self.dims.n_mels,
self.dims.n_audio_ctx, self.dims.n_audio_ctx,
self.dims.n_audio_state, self.dims.n_audio_state,
self.dims.n_audio_head, self.dims.n_audio_head,
self.dims.n_audio_layer, self.dims.n_audio_layer,
)) )
setattr(self, 'decoder', TextDecoderANE( self.decoder = TextDecoderANE(
self.dims.n_vocab, self.dims.n_vocab,
self.dims.n_text_ctx, self.dims.n_text_ctx,
self.dims.n_text_state, self.dims.n_text_state,
self.dims.n_text_head, self.dims.n_text_head,
self.dims.n_text_layer, self.dims.n_text_layer,
)) )
self._register_load_state_dict_pre_hook(linear_to_conv2d_map) self._register_load_state_dict_pre_hook(linear_to_conv2d_map)

tests/run-tests.sh

@ -13,7 +13,7 @@
# #
# Usage: # Usage:
# #
# ./tests/run-tests.sh <model_name> # ./tests/run-tests.sh <model_name> [threads]
# #
cd `dirname $0` cd `dirname $0`
@ -32,7 +32,7 @@ function list_models {
} }
if [ $# -eq 0 ]; then if [ $# -eq 0 ]; then
printf "Usage: $0 [model]\n\n" printf "Usage: $0 [model] [threads]\n\n"
printf "No model specified. Aborting\n" printf "No model specified. Aborting\n"
list_models list_models
exit 1 exit 1
@ -41,6 +41,11 @@ fi
model=$1 model=$1
main="../main" main="../main"
threads=""
if [ $# -eq 2 ]; then
threads="-t $2"
fi
if [ ! -f ../models/ggml-$model.bin ]; then if [ ! -f ../models/ggml-$model.bin ]; then
printf "Model $model not found. Aborting\n" printf "Model $model not found. Aborting\n"
list_models list_models
@ -105,7 +110,7 @@ function run_lang() {
fi fi
fi fi
$main -m ../models/ggml-$model.bin -f $fname_dst -l $lang -otxt 2> /dev/null $main -m ../models/ggml-$model.bin $threads -f $fname_dst -l $lang -otxt 2> /dev/null
git diff --no-index --word-diff=color --word-diff-regex=. $lang-$i-ref.txt $fname_dst.txt git diff --no-index --word-diff=color --word-diff-regex=. $lang-$i-ref.txt $fname_dst.txt

whisper.cpp

@ -1,4 +1,3 @@
#define WHISPER_BUILD
#include "whisper.h" #include "whisper.h"
#if WHISPER_USE_COREML #if WHISPER_USE_COREML
#include "coreml/whisper-encoder.h" #include "coreml/whisper-encoder.h"
@ -102,7 +101,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
#define WHISPER_PRINT_DEBUG(...) #define WHISPER_PRINT_DEBUG(...)
#endif #endif
#define WHISPER_USE_FLASH_ATTN //#define WHISPER_USE_FLASH_ATTN
//#define WHISPER_USE_FLASH_FF //#define WHISPER_USE_FLASH_FF
#define WHISPER_MAX_DECODERS 16 #define WHISPER_MAX_DECODERS 16
@ -224,11 +223,11 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
static const size_t MB = 1ull*1024*1024; static const size_t MB = 1ull*1024*1024;
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = { static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
{ MODEL_TINY, 14ull*MB }, { MODEL_TINY, 62ull*MB },
{ MODEL_BASE, 18ull*MB }, { MODEL_BASE, 80ull*MB },
{ MODEL_SMALL, 28ull*MB }, { MODEL_SMALL, 120ull*MB },
{ MODEL_MEDIUM, 36ull*MB }, { MODEL_MEDIUM, 158ull*MB },
{ MODEL_LARGE, 44ull*MB }, { MODEL_LARGE, 198ull*MB },
}; };
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = { static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@ -255,12 +254,70 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
{ MODEL_LARGE, 9ull*MB }, { MODEL_LARGE, 9ull*MB },
}; };
static const std::map<e_model, size_t> MEM_REQ_MODEL = { static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
{ GGML_TYPE_F32,
{
{ MODEL_TINY, 74ull*MB }, { MODEL_TINY, 74ull*MB },
{ MODEL_BASE, 142ull*MB }, { MODEL_BASE, 142ull*MB },
{ MODEL_SMALL, 466ull*MB }, { MODEL_SMALL, 466ull*MB },
{ MODEL_MEDIUM, 1464ull*MB }, { MODEL_MEDIUM, 1464ull*MB },
{ MODEL_LARGE, 2952ull*MB }, { MODEL_LARGE, 2952ull*MB },
},
},
{ GGML_TYPE_F16,
{
{ MODEL_TINY, 74ull*MB },
{ MODEL_BASE, 142ull*MB },
{ MODEL_SMALL, 466ull*MB },
{ MODEL_MEDIUM, 1464ull*MB },
{ MODEL_LARGE, 2952ull*MB },
},
},
{ GGML_TYPE_Q4_0,
{
{ MODEL_TINY, 26ull*MB },
{ MODEL_BASE, 50ull*MB },
{ MODEL_SMALL, 154ull*MB },
{ MODEL_MEDIUM, 470ull*MB },
{ MODEL_LARGE, 940ull*MB },
},
},
{ GGML_TYPE_Q4_1,
{
{ MODEL_TINY, 31ull*MB },
{ MODEL_BASE, 57ull*MB },
{ MODEL_SMALL, 181ull*MB },
{ MODEL_MEDIUM, 559ull*MB },
{ MODEL_LARGE, 1122ull*MB },
},
},
{ GGML_TYPE_Q4_2,
{
{ MODEL_TINY, 26ull*MB },
{ MODEL_BASE, 50ull*MB },
{ MODEL_SMALL, 154ull*MB },
{ MODEL_MEDIUM, 470ull*MB },
{ MODEL_LARGE, 940ull*MB },
},
},
{ GGML_TYPE_Q5_0, // TODO: fix
{
{ MODEL_TINY, 31ull*MB },
{ MODEL_BASE, 57ull*MB },
{ MODEL_SMALL, 181ull*MB },
{ MODEL_MEDIUM, 559ull*MB },
{ MODEL_LARGE, 1122ull*MB },
},
},
{ GGML_TYPE_Q5_1,
{
{ MODEL_TINY, 31ull*MB },
{ MODEL_BASE, 57ull*MB },
{ MODEL_SMALL, 181ull*MB },
{ MODEL_MEDIUM, 559ull*MB },
{ MODEL_LARGE, 1122ull*MB },
},
},
}; };
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = { static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
@ -280,11 +337,11 @@ static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
}; };
static const std::map<e_model, size_t> MEM_REQ_ENCODE = { static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
{ MODEL_TINY, 6ull*MB }, { MODEL_TINY, 30ull*MB },
{ MODEL_BASE, 8ull*MB }, { MODEL_BASE, 38ull*MB },
{ MODEL_SMALL, 13ull*MB }, { MODEL_SMALL, 56ull*MB },
{ MODEL_MEDIUM, 22ull*MB }, { MODEL_MEDIUM, 74ull*MB },
{ MODEL_LARGE, 33ull*MB }, { MODEL_LARGE, 94ull*MB },
}; };
static const std::map<e_model, size_t> MEM_REQ_DECODE = { static const std::map<e_model, size_t> MEM_REQ_DECODE = {
@ -370,7 +427,7 @@ struct whisper_hparams {
int32_t n_text_head = 6; int32_t n_text_head = 6;
int32_t n_text_layer = 4; int32_t n_text_layer = 4;
int32_t n_mels = 80; int32_t n_mels = 80;
int32_t f16 = 1; int32_t ftype = 1;
}; };
// audio encoding layer // audio encoding layer
@ -592,7 +649,7 @@ struct whisper_state {
std::string path_model; // populated by whisper_init_from_file() std::string path_model; // populated by whisper_init_from_file()
#ifdef WHISPER_USE_COREML #ifdef WHISPER_USE_COREML
whisper_coreml_context * ctx_coreml; whisper_coreml_context * ctx_coreml = nullptr;
#endif #endif
// [EXPERIMENTAL] token-level timestamps data // [EXPERIMENTAL] token-level timestamps data
@ -640,7 +697,8 @@ struct whisper_context {
int64_t t_load_us = 0; int64_t t_load_us = 0;
int64_t t_start_us = 0; int64_t t_start_us = 0;
ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16) ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
whisper_model model; whisper_model model;
whisper_vocab vocab; whisper_vocab vocab;
@ -697,7 +755,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
const ggml_type wtype = cache.k->type; const ggml_type wtype = cache.k->type;
WHISPER_ASSERT(wtype == cache.v->type); WHISPER_ASSERT(wtype == cache.v->type);
WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype)); WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(), /*.mem_size =*/ cache.buf.size(),
@ -770,7 +828,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
read_safe(loader, hparams.n_text_head); read_safe(loader, hparams.n_text_head);
read_safe(loader, hparams.n_text_layer); read_safe(loader, hparams.n_text_layer);
read_safe(loader, hparams.n_mels); read_safe(loader, hparams.n_mels);
read_safe(loader, hparams.f16); read_safe(loader, hparams.ftype);
assert(hparams.n_text_state == hparams.n_audio_state); assert(hparams.n_text_state == hparams.n_audio_state);
@ -794,11 +852,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
model.type = e_model::MODEL_LARGE; model.type = e_model::MODEL_LARGE;
} }
// for the big tensors, we have the option to store the data in 16-bit floats // for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation // in order to save memory and also to speed up the computation
wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wctx.wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
return false;
}
const size_t scale = model.hparams.f16 ? 1 : 2; const size_t scale = model.hparams.ftype ? 1 : 2;
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx); fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@ -810,7 +872,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head); fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels); fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
fprintf(stderr, "%s: type = %d\n", __func__, model.type); fprintf(stderr, "%s: type = %d\n", __func__, model.type);
// print memory requirements // print memory requirements
@ -821,7 +883,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
MEM_REQ_SCRATCH1.at(model.type) + MEM_REQ_SCRATCH1.at(model.type) +
MEM_REQ_SCRATCH2.at(model.type) + MEM_REQ_SCRATCH2.at(model.type) +
MEM_REQ_SCRATCH3.at(model.type) + MEM_REQ_SCRATCH3.at(model.type) +
scale*MEM_REQ_MODEL.at (model.type) + scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
scale*MEM_REQ_KV_CROSS.at(model.type) + scale*MEM_REQ_KV_CROSS.at(model.type) +
scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)); scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
@ -837,7 +899,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// always have at least one decoder // always have at least one decoder
wctx.model.buf = new std::vector<uint8_t>(); wctx.model.buf = new std::vector<uint8_t>();
wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type)); wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type));
// we skip initialization of the state until it is needed // we skip initialization of the state until it is needed
// because it might be that state will always be provided externally. // because it might be that state will always be provided externally.
@ -928,6 +990,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
size_t ctx_size = 0; size_t ctx_size = 0;
const ggml_type wtype = wctx.wtype; const ggml_type wtype = wctx.wtype;
const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
{ {
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
@ -946,92 +1009,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// encoder // encoder
{ {
ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe; ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w; ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b; ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
} }
// decoder // decoder
{ {
ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe; ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te; ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w; ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b; ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
} }
// encoder layers // encoder layers
{ {
ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
-ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w
+ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
-ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
+ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
-ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
+ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
+ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
-ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w
+ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
-ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
+ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
-ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w
+ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
-ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w
+ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
-ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
+ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
-ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w
+ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
-ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
+ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
}
// decoder layers
{
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
-ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w
+ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
-ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
+ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
-ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w
+ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
//
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
}
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
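The switch from ggml_type_size() to ggml_type_sizef() in these estimates matters once block-quantized weight types are allowed: ggml_type_sizef() reports the size per element as a float (type size divided by block size), so an element count can be multiplied by it directly for Q4_0-style types as well as F16/F32. A minimal sketch of the idea, with hypothetical shapes (not part of the patch):

```
#include <cstdint>
#include <cstdio>
#include "ggml.h"

int main() {
    const int64_t n_state = 512;               // hypothetical layer width
    const int64_t nelem   = 4*n_state*n_state; // e.g. one mlp_0_w-sized matrix

    // ggml_type_sizef(type) == ggml_type_size(type)/ggml_blck_size(type), as a float,
    // so the product below is a byte estimate even for block-quantized types.
    const ggml_type types[] = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0 };
    for (ggml_type t : types) {
        printf("%-8s ~ %.1f KiB\n", ggml_type_name(t), nelem*ggml_type_sizef(t)/1024.0);
    }
    return 0;
}
```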
@@ -1079,10 +1142,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
{
model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
-model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state);
+model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
-model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state);
+model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
@@ -1259,11 +1322,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
while (true) {
int32_t n_dims;
int32_t length;
-int32_t ftype;
+int32_t ttype;
read_safe(loader, n_dims);
read_safe(loader, length);
-read_safe(loader, ftype);
+read_safe(loader, ttype);
if (loader->eof(loader->context)) {
break;
@@ -1298,9 +1361,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
return false;
}
-const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+const size_t bpe = ggml_type_size(ggml_type(ttype));
-if (nelements*bpe != ggml_nbytes(tensor)) {
+if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
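For quantized tensors, the value returned by ggml_type_size() is the size of one whole block rather than of a single element, so the expected byte count has to be divided by the block's element count; for F32/F16 the block size is 1 and the check reduces to the old one. A worked example with assumed numbers (the 20-byte / 32-element block is hypothetical, purely to show the arithmetic):

```
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical 4-bit block format: 32 elements packed into a 20-byte block.
    const int64_t nelements = 1024; // elements in the tensor
    const size_t  bpe       = 20;   // what ggml_type_size() would report (bytes per block)
    const int     blck      = 32;   // what ggml_blck_size() would report (elements per block)

    const size_t expected = (nelements / blck) * bpe; // what ggml_nbytes() reports
    const size_t checked  = (nelements * bpe) / blck; // what the loader now computes

    printf("expected = %zu, checked = %zu\n", expected, checked); // both 640
    return 0;
}
```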
@@ -1309,7 +1372,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
BYTESWAP_TENSOR(tensor);
-//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
model.n_loaded++;
}
@@ -1385,9 +1448,15 @@ static bool whisper_encode_internal(
}
}
-#ifndef WHISPER_USE_COREML
struct ggml_tensor * cur;
+#ifndef WHISPER_USE_COREML
+const bool use_coreml = false;
+#else
+const bool use_coreml = wstate.ctx_coreml != nullptr;
+#endif
+if (!use_coreml) {
// convolution + gelu
{
wstate.use_buf(ctx0, 1);
@@ -1502,14 +1571,14 @@ static bool whisper_encode_internal(
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Kcur,
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * V =
@@ -1519,7 +1588,7 @@ static bool whisper_encode_internal(
Vcur,
n_state/n_head, n_head, n_ctx),
1, 2, 0, 3),
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
+ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
#else
@@ -1534,7 +1603,7 @@ static bool whisper_encode_internal(
ggml_permute(ctx0,
ggml_cpy(ctx0,
Kcur,
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
// K * Q
@@ -1548,26 +1617,17 @@ static bool whisper_encode_internal(
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
-//struct ggml_tensor * V_trans =
-// ggml_permute(ctx0,
-// ggml_cpy(ctx0,
-// Vcur,
-// ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-// 1, 2, 0, 3);
-//struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
struct ggml_tensor * V =
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
Vcur,
n_state/n_head, n_head, n_ctx),
-0, 2, 1, 3),
+1, 2, 0, 3),
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
+ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
);
-struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
+struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
#endif
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
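In the non-flash-attention path the change above permutes V with (1, 2, 0, 3) instead of (0, 2, 1, 3), so the copied tensor already has the layout the explicit ggml_transpose() used to produce, and the transpose before the mul_mat with the softmax output can be dropped. A small shape-only sketch, assuming ggml_permute() places source dimension i at the i-th axis argument (which is consistent with the target tensor shapes in the hunk); the sizes are illustrative:

```
#include <array>
#include <cstdio>

// Shape-only model of ggml_permute(a, d0, d1, d2, d3): source dim i goes to dest dim di.
static std::array<long, 4> permute(const std::array<long, 4> & ne, int d0, int d1, int d2, int d3) {
    std::array<long, 4> out{};
    const int dst[4] = { d0, d1, d2, d3 };
    for (int i = 0; i < 4; ++i) out[dst[i]] = ne[i];
    return out;
}

int main() {
    const long n_state = 512, n_head = 8, n_ctx = 1500; // illustrative sizes
    const std::array<long, 4> ne = { n_state/n_head, n_head, n_ctx, 1 };

    const auto v_old = permute(ne, 0, 2, 1, 3); // {64, 1500, 8, 1}
    const auto v_new = permute(ne, 1, 2, 0, 3); // {1500, 64, 8, 1}

    // old target: (n_state/n_head, n_ctx, n_head); new target: (n_ctx, n_state/n_head, n_head)
    printf("old V layout: %ld %ld %ld\n", v_old[0], v_old[1], v_old[2]);
    printf("new V layout: %ld %ld %ld\n", v_new[0], v_new[1], v_new[2]);
    return 0;
}
```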
@@ -1622,7 +1682,7 @@ static bool whisper_encode_internal(
wstate.use_buf(ctx0, 0);
cur = ggml_flash_ff(ctx0,
-ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
+ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
#else
wstate.use_buf(ctx0, 0);
@@ -1693,12 +1753,16 @@ static bool whisper_encode_internal(
//ggml_graph_print(&gf);
}
-#else
+}
+#ifdef WHISPER_USE_COREML
+else
+{
wstate.use_buf(ctx0, -1);
-struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+}
#endif
// cur
@@ -2356,11 +2420,7 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float>
sum += fft_out[k] * filters.data[j * n_fft + k];
}
-if (sum < 1e-10) {
-sum = 1e-10;
-}
-sum = log10(sum);
+sum = log10(std::max(sum, 1e-10));
mel.data[j * mel.n_len + i] = sum;
}
@@ -2540,9 +2600,9 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
struct whisper_state * whisper_init_state(whisper_context * ctx) {
whisper_state * state = new whisper_state;
-const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
+const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
-if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
+if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
delete state;
return nullptr;
@@ -2553,7 +2613,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
-if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) {
+if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
delete state;
return nullptr;
@@ -2573,10 +2633,12 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
if (!state->ctx_coreml) {
fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+#ifndef WHISPER_COREML_ALLOW_FALLBACK
return nullptr;
-}
+#endif
+} else {
fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+}
#endif
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
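With the guard above, a failed Core ML load is only fatal when WHISPER_COREML_ALLOW_FALLBACK is left undefined; when it is defined, whisper_init_state() continues and the regular ggml encoder path (the if (!use_coreml) branch shown earlier) is used. A minimal sketch of the compile-time behaviour, assuming the define is passed as a plain compiler flag such as -DWHISPER_COREML_ALLOW_FALLBACK (the make/CMake wiring is not part of this hunk):

```
// Sketch only: mirrors the preprocessor logic in whisper_init_state().
#include <cstdio>

int main() {
#ifdef WHISPER_COREML_ALLOW_FALLBACK
    printf("Core ML load failure: continue with the ggml encoder\n");
#else
    printf("Core ML load failure: whisper_init_state() returns nullptr\n");
#endif
    return 0;
}
```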
@@ -2602,7 +2664,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
}
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
-whisper_model_loader loader = {};
fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
@@ -2612,6 +2673,8 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
return nullptr;
}
+whisper_model_loader loader = {};
loader.context = &fin;
loader.read = [](void * ctx, void * output, size_t read_size) {
@@ -2647,10 +2710,11 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
};
buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
-whisper_model_loader loader = {};
fprintf(stderr, "%s: loading model from buffer\n", __func__);
+whisper_model_loader loader = {};
loader.context = &ctx;
loader.read = [](void * ctx, void * output, size_t read_size) {
@@ -2747,8 +2811,10 @@ void whisper_free_state(struct whisper_state * state)
}
#ifdef WHISPER_USE_COREML
+if (state->ctx_coreml != nullptr) {
whisper_coreml_free(state->ctx_coreml);
state->ctx_coreml = nullptr;
+}
#endif
delete state;
@@ -2909,7 +2975,6 @@ int whisper_lang_id(const char * lang) {
fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
return -1;
}
return g_lang.at(lang).first;
}
@@ -3047,8 +3112,8 @@ int whisper_model_n_mels(struct whisper_context * ctx) {
return ctx->model.hparams.n_mels;
}
-int whisper_model_f16(struct whisper_context * ctx) {
+int whisper_model_ftype(struct whisper_context * ctx) {
-return ctx->model.hparams.f16;
+return ctx->model.hparams.ftype;
}
int whisper_model_type(struct whisper_context * ctx) {
@@ -3303,15 +3368,15 @@ static void whisper_exp_compute_token_level_timestamps(
// trim from start (in place)
static inline void ltrim(std::string &s) {
-s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
+s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
-return !std::isspace(ch);
+return std::isspace(ch);
}));
}
// trim from end (in place)
static inline void rtrim(std::string &s) {
-s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
+s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
-return !std::isspace(ch);
+return std::isspace(ch);
}).base(), s.end());
}
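The trim helpers keep the same behaviour; std::find_if_not with std::isspace simply replaces the negated predicate. A quick self-contained usage sketch of the two helpers exactly as defined above:

```
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <string>

static inline void ltrim(std::string &s) {
    s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
        return std::isspace(ch);
    }));
}

static inline void rtrim(std::string &s) {
    s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
        return std::isspace(ch);
    }).base(), s.end());
}

int main() {
    std::string s = "   hello world \n";
    ltrim(s);
    rtrim(s);
    printf("[%s]\n", s.c_str()); // prints [hello world]
    return 0;
}
```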
@@ -3844,7 +3909,7 @@ int whisper_full_with_state(
}
const int seek_start = params.offset_ms/10;
-const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len_from_state(state) : params.duration_ms/10);
+const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
// if length of spectrogram is less than 1s (100 samples), then return
// basically don't process anything that is less than 1s
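The corrected expression only adds seek_start when an explicit duration is requested; otherwise seek_end is simply the end of the spectrogram. A worked example with assumed values (a 60 s offset into a 90 s file, so whisper_n_len_from_state() would be about 9000 frames at 10 ms per frame):

```
#include <cstdio>

int main() {
    // Assumed inputs for illustration: 60 s offset, no duration limit, 90 s of audio.
    const int offset_ms   = 60000;
    const int duration_ms = 0;
    const int n_len       = 9000; // stand-in for whisper_n_len_from_state(state)

    const int seek_start   = offset_ms/10;                                            // 6000
    const int seek_end_old = seek_start + (duration_ms == 0 ? n_len : duration_ms/10); // 15000, past the end
    const int seek_end_new = duration_ms == 0 ? n_len : seek_start + duration_ms/10;   // 9000, end of file

    printf("start=%d old_end=%d new_end=%d\n", seek_start, seek_end_old, seek_end_new);
    return 0;
}
```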
@@ -4823,23 +4888,32 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+// put a bunch of random data in the buffer
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
for (int j = 0; j < (int) sizes.size(); j++) {
+int n_q4_0 = 0;
+int n_q4_1 = 0;
int n_fp16 = 0;
int n_fp32 = 0;
// GFLOPS/s
+double s_q4_0 = 0.0;
+double s_q4_1 = 0.0;
double s_fp16 = 0.0;
double s_fp32 = 0.0;
const size_t N = sizes[j];
-for (int k = 0; k < 2; ++k) {
+for (int k = 0; k < 4; ++k) {
-const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+const ggml_type wtype =
+k == 0 ? GGML_TYPE_Q4_0 :
+k == 1 ? GGML_TYPE_Q4_1 :
+k == 2 ? GGML_TYPE_F16 :
+GGML_TYPE_F32;
-double & s = k == 0 ? s_fp16 : s_fp32;
+double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
-int & n = k == 0 ? n_fp16 : n_fp32;
+int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
@@ -4883,8 +4957,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
s = ((2.0*N*N*N*n)/tsum)*1e-9;
}
-snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n",
-N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32);
s += strbuf;
}
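For reference, the per-type GFLOPS figure comes from the 2·N³ multiply-adds of an N×N matrix product, as computed above: s = (2·N³·n / tsum)·1e-9, where n is the number of runs completed in tsum seconds. A quick numeric check with assumed benchmark figures:

```
#include <cstdio>

int main() {
    // Assumed values, purely for illustration.
    const double N    = 1024; // matrix dimension
    const double n    = 10;   // runs completed
    const double tsum = 1.0;  // total time in seconds

    const double gflops = ((2.0*N*N*N*n)/tsum)*1e-9; // same formula as the bench code
    printf("%.1f GFLOPS\n", gflops); // ~21.5
    return 0;
}
```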


@@ -258,7 +258,7 @@ extern "C" {
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
-WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
+WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
// Token logits obtained from the last call to whisper_decode()