release : v1.4.2

ggml : add AVX dot products
talk-llama : fix build + sync latest llama.cpp
2025-07-04 08:20:57 +02:00 · 2023-05-14 19:06:45 +03:00 · 2023-05-14 18:56:46 +03:00 · 2023-05-14 18:46:42 +03:00 · 2023-05-14 18:11:08 +03:00 · 2023-05-14 18:09:44 +03:00
67 changed files with 12388 additions and 3920 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@ -236,6 +236,61 @@ jobs:
          name: whisper-blas-bin-${{ matrix.arch }}
          path: build/bin/${{ matrix.build }}
          
+  windows-cublas:
+    runs-on: windows-latest
+    
+    strategy:
+      matrix:
+        build: [Release]
+        arch: [x64]
+        cublas: [ON]
+        sdl2: [ON]
+        include:
+          - arch: x64
+            s2arc: x64
+          - sdl2: ON
+            s2ver: 2.26.0
+            
+    steps:
+      - name: Clone
+        uses: actions/checkout@v1
+
+      - name: Add msbuild to PATH
+        uses: microsoft/setup-msbuild@v1
+        
+      - name: Install CUDA Toolkit
+        id: cuda-toolkit
+        uses: Jimver/cuda-toolkit@v0.2.10
+        
+      - name: Fetch SDL2 and set SDL2_DIR
+        if: matrix.sdl2 == 'ON'
+        run: |
+          C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
+          7z x sdl2.zip
+          echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
+        
+      - name: Configure
+        run: >
+          cmake -S . -B ./build -A ${{ matrix.arch }}
+          -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+          -DWHISPER_CUBLAS=1
+      
+      - name: Build
+        run: |
+          cd ./build
+          msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
+          
+      - name: Copy SDL2.dll
+        if: matrix.sdl2 == 'ON'
+        run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
+          
+      - name: Upload binaries
+        if: matrix.sdl2 == 'ON'
+        uses: actions/upload-artifact@v1
+        with:
+          name: whisper-cublas-bin-${{ matrix.arch }}
+          path: build/bin/${{ matrix.build }}
+
  emscripten:
    runs-on: ubuntu-latest

@ -278,7 +333,9 @@ jobs:
        uses: actions/checkout@v1

      - name: Configure
-        run: cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
+        run: |
+          cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
+          mkdir models/ggml-base.en-encoder.mlmodelc

      - name: Build objc example
        run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build
--- a/.gitignore
+++ b/.gitignore
@ -12,6 +12,7 @@ build-em/
 build-debug/
 build-release/
 build-static/
+build-cublas/
 build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/
@ -22,6 +23,7 @@ build-sanitize-thread/
 /talk
 /talk-llama
 /bench
+/quantize

 arm_neon.h
 sync.sh
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,10 +1,6 @@
 cmake_minimum_required (VERSION 3.0)

-project(whisper.cpp VERSION 1.3.0)
-
-if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
-    add_compile_options(/utf-8)
-endif ()
+project(whisper.cpp VERSION 1.4.2)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -51,17 +47,21 @@ option(WHISPER_SANITIZE_UNDEFINED      "whisper: enable undefined sanitizer" OFF
 option(WHISPER_BUILD_TESTS            "whisper: build tests"    ${WHISPER_STANDALONE})
 option(WHISPER_BUILD_EXAMPLES         "whisper: build examples" ${WHISPER_STANDALONE})

-option(WHISPER_SUPPORT_SDL2            "whisper: support for libSDL2" OFF)
+option(WHISPER_SDL2                   "whisper: support for libSDL2" OFF)

-if (APPLE)
-    option(WHISPER_NO_ACCELERATE       "whisper: disable Accelerate framework" OFF)
 option(WHISPER_NO_AVX                 "whisper: disable AVX"  OFF)
 option(WHISPER_NO_AVX2                "whisper: disable AVX2" OFF)
 option(WHISPER_NO_FMA                 "whisper: disable FMA"  OFF)
+option(WHISPER_NO_F16C                "whisper: disable F16c" OFF)

+if (APPLE)
+    option(WHISPER_NO_ACCELERATE         "whisper: disable Accelerate framework" OFF)
    option(WHISPER_COREML                "whisper: enable Core ML framework"     OFF)
+    option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback"    OFF)
 else()
-    option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_OPENBLAS              "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_CUBLAS                "whisper: support for cuBLAS"   OFF)
+    option(WHISPER_CLBLAST               "whisper: use CLBlast"          OFF)
 endif()

 option(WHISPER_PERF "whisper: enable perf timings" OFF)
@ -119,10 +119,14 @@ if (APPLE)
        else()
            message(WARNING "CoreML framework not found")
        endif()
+
+        if (WHISPER_COREML_ALLOW_FALLBACK)
+            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_COREML_ALLOW_FALLBACK)
+        endif()
    endif()
 endif()

-if (WHISPER_SUPPORT_OPENBLAS)
+if (WHISPER_OPENBLAS)
    find_library(OPENBLAS_LIB
        NAMES openblas libopenblas
        )
@ -136,6 +140,46 @@ if (WHISPER_SUPPORT_OPENBLAS)
    endif()
 endif()

+if (WHISPER_CUBLAS)
+    cmake_minimum_required(VERSION 3.17)
+
+    find_package(CUDAToolkit)
+
+    if (CUDAToolkit_FOUND)
+        message(STATUS "cuBLAS found")
+
+        enable_language(CUDA)
+
+        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+
+        add_compile_definitions(GGML_USE_CUBLAS)
+
+        if (WHISPER_STATIC)
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        else()
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        endif()
+
+    else()
+        message(WARNING "cuBLAS not found")
+    endif()
+endif()
+
+if (WHISPER_CLBLAST)
+    find_package(CLBlast)
+    if (CLBlast_FOUND)
+        message(STATUS "CLBlast found")
+
+        set(GGML_OPENCL_SOURCES ggml-opencl.c ggml-opencl.h)
+
+        add_compile_definitions(GGML_USE_CLBLAST)
+
+        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} clblast)
+    else()
+        message(WARNING "CLBlast not found")
+    endif()
+endif()
+
 # compiler flags

 if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@ -178,9 +222,17 @@ if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES
 else()
    message(STATUS "x86 detected")
    if (MSVC)
+            if(NOT WHISPER_NO_AVX2)
                set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
                set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
+            else()
+                if(NOT WHISPER_NO_AVX)
+                    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
+                    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
+                    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
+                endif()
+            endif()
    else()
        if (EMSCRIPTEN)
            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread")
@ -242,6 +294,8 @@ set(TARGET whisper)
 add_library(${TARGET}
    ggml.h
    ggml.c
+    ${GGML_CUDA_SOURCES}
+    ${GGML_OPENCL_SOURCES}
    whisper.h
    whisper.cpp
    )
@ -271,7 +325,19 @@ if (BUILD_SHARED_LIBS)

    target_compile_definitions(${TARGET} PUBLIC
        WHISPER_SHARED
+        GGML_SHARED
        )
+
+    target_compile_definitions(${TARGET} PRIVATE
+        WHISPER_BUILD
+        GGML_BUILD
+        )
+endif()
+
+if (GGML_CUDA_SOURCES)
+    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
 endif()

 if (EMSCRIPTEN)
--- a/70
+++ b/70
@ -1,3 +1,5 @@
+default: main bench quantize
+
 ifndef UNAME_S
 UNAME_S := $(shell uname -s)
 endif
@ -123,6 +125,7 @@ endif
 ifeq ($(UNAME_M),amd64)
 	CFLAGS += -mavx -mavx2 -mfma -mf16c
 endif
+
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
 	ifneq (,$(findstring POWER9,$(POWER9_M)))
@ -133,6 +136,7 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
 		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
 	endif
 endif
+
 ifndef WHISPER_NO_ACCELERATE
 	# Mac M1 - include Accelerate framework
 	ifeq ($(UNAME_S),Darwin)
@ -140,36 +144,71 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
+
 ifdef WHISPER_COREML
 	CXXFLAGS += -DWHISPER_USE_COREML
 	LDFLAGS  += -framework Foundation -framework CoreML
+
+ifdef WHISPER_COREML_ALLOW_FALLBACK
+	CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
 endif
+endif
+
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
 endif
+
+ifdef WHISPER_CUBLAS
+	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+	WHISPER_OBJ += ggml-cuda.o
+	NVCC        = nvcc
+	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=native
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+endif
+
+ifdef WHISPER_CLBLAST
+	CFLAGS 		+= -DGGML_USE_CLBLAST
+	LDFLAGS	 	+= -lclblast -lOpenCL
+	WHISPER_OBJ	+= ggml-opencl.o
+	
+ggml-opencl.o: ggml-opencl.c ggml-opencl.h
+	$(CC) $(CFLAGS) -c $< -o $@
+endif
+
 ifdef WHISPER_GPROF
 	CFLAGS   += -pg
 	CXXFLAGS += -pg
 endif
+
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	CFLAGS   += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
+
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# 32-bit Raspberry Pi 1, 2, 3
 	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
 endif
+
 ifneq ($(filter armv7%,$(UNAME_M)),)
 	# 32-bit ARM, for example on Armbian or possibly raspbian
-	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+	#CFLAGS   += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
+	#CXXFLAGS += -mfpu=neon -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access

-	# 64-bit ARM, use these (TODO: auto-detect 64-bit)
-	# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+	# 64-bit ARM on 32-bit OS, use these (TODO: auto-detect 64-bit)
+	CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
+	CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
 endif
+
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 4
-	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+	CFLAGS   += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
+	CXXFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -funsafe-math-optimizations -mno-unaligned-access
 endif

 #
@ -187,28 +226,26 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )

-default: main bench
-
 #
 # Build library
 #

-ggml.o: ggml.c ggml.h
-	$(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
+ggml.o: ggml.c ggml.h ggml-cuda.h
+	$(CC)  $(CFLAGS)   -c $< -o $@

-whisper.o: whisper.cpp whisper.h ggml.h
-	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
+whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@

 ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
+WHISPER_OBJ += whisper.o
 else
 whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
-	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
+	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder.mm -o whisper-encoder.o

 whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
 	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o

-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
+WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
 endif

 libwhisper.a: ggml.o $(WHISPER_OBJ)
@ -218,7 +255,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so

 #
 # Examples
@ -226,7 +263,7 @@ clean:

 CC_SDL=`sdl2-config --cflags --libs`

-SRC_COMMON = examples/common.cpp
+SRC_COMMON     = examples/common.cpp examples/common-ggml.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp

 main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
@ -236,6 +273,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
 bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)

+quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
+	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
+
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)

--- a/README.md
+++ b/README.md
@ -1,10 +1,12 @@
 # whisper.cpp

+![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
+
 [![Actions Status](https://github.com/ggerganov/whisper.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/whisper.cpp/actions)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Beta: [v1.3.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.3.0) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Beta: [v1.4.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.2) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@ -13,9 +15,12 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - AVX intrinsics support for x86 architectures
 - VSX intrinsics support for POWER architectures
 - Mixed F16 / F32 precision
+- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
 - Low memory usage (Flash Attention)
 - Zero memory allocations at runtime
 - Runs on the CPU
+- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
+- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)

 Supported platforms:
@ -66,6 +71,8 @@ Then, download one of the Whisper models converted in [ggml format](models). For
 bash ./models/download-ggml-model.sh base.en
 ```

+If you wish to convert the Whisper models to ggml format yourself, instructions are in [models/README.md](models/README.md).
+
 Now build the [main](examples/main) example and transcribe an audio file like this:

 ```bash
@ -225,6 +232,22 @@ make large
 | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
 | large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |

+## Quantization
+
+`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
+Quantized models require less memory and disk space and, depending on the hardware, can be processed more efficiently.
+
+Here are the steps for creating and using a quantized model:
+
+```bash
+# quantize a model with Q5_0 method
+make quantize
+./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
+
+# run the examples as usual, specifying the quantized model file
+./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
+```
+
 ## Core ML support

 On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
@ -238,6 +261,12 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
  pip install coremltools
  ```

+  - To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
+  - Python 3.10 is recommended.
+  - [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html), for this step:
+    - To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
+    - To activate the environment, use: `conda activate py310-whisper`
+
 - Generate a Core ML model. For example, to generate a `base.en` model, use:

  ```bash
@ -279,10 +308,46 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in

 For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).

+## NVIDIA GPU support via cuBLAS
+
+With NVIDIA cards, the Encoder processing can be offloaded to the GPU to a large extent through cuBLAS.
+First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
+
+Now build `whisper.cpp` with cuBLAS support:
+
+```
+make clean
+WHISPER_CUBLAS=1 make -j
+```
+
+## OpenCL GPU support via CLBlast
+
+For cards and integrated GPUs that support OpenCL, the Encoder processing can be largely offloaded to the GPU through CLBlast. This is especially useful for users with AMD APUs or low-end devices, where it can give up to a ~2x speedup.
+
+First, make sure you have installed `CLBlast` for your OS or Distribution: https://github.com/CNugteren/CLBlast
+
+Now build `whisper.cpp` with CLBlast support:
+
+```
+Makefile:
+cd whisper.cpp
+make clean
+WHISPER_CLBLAST=1 make -j
+
+CMake:
+cd whisper.cpp ; mkdir build ; cd build
+cmake -DWHISPER_CLBLAST=ON  ..
+make clean
+make -j
+cp bin/* ../ 
+```
+
+
+Run all the examples as usual.
+
 ## Limitations

 - Inference only
- No GPU support (yet)

 ## Another example

--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.3.0",
+  "version": "1.4.2",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/javascript/whisper.js
+++ b/bindings/javascript/whisper.js
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -1,5 +1,9 @@
-#import "coreml/whisper-encoder.h"
-#import "coreml/whisper-encoder-impl.h"
+#if !__has_feature(objc_arc)
+#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
+#endif
+
+#import "whisper-encoder.h"
+#import "whisper-encoder-impl.h"

 #import <CoreML/CoreML.h>

@ -51,15 +55,7 @@ void whisper_coreml_encode(

    whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];

-    MLMultiArray * outMA = outCoreML.output;
-
-    //NSArray<NSNumber *> * shape = outMA.shape;
-    //NSArray<NSNumber *> * strides = outMA.strides;
-
-    //printf("shape:   %ld %ld %ld %ld\n", [shape[0] longValue], [shape[1] longValue], [shape[2] longValue], [shape[3] longValue]);
-    //printf("strides: %ld %ld %ld %ld\n", [strides[0] longValue], [strides[1] longValue], [strides[2] longValue], [strides[3] longValue]);
-
-    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
+    memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
 }

 #if __cplusplus
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -4,7 +4,7 @@ find_package(Threads REQUIRED)

 # third-party

-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
    # SDL2
    find_package(SDL2 REQUIRED)

@ -21,13 +21,17 @@ set(TARGET common)
 add_library(${TARGET} STATIC
    common.h
    common.cpp
+    common-ggml.h
+    common-ggml.cpp
    )

 include(DefaultTargetOptions)

+target_link_libraries(${TARGET} PRIVATE whisper)
+
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)

-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
    # common-sdl

    set(TARGET common-sdl)
@ -62,6 +66,7 @@ else()
    add_subdirectory(stream)
    add_subdirectory(command)
    add_subdirectory(bench)
+    add_subdirectory(quantize)
    add_subdirectory(talk)
    add_subdirectory(talk-llama)
 endif()
--- a/examples/addon.node/test/whisper.spec.js
+++ b/examples/addon.node/test/whisper.spec.js
@ -18,5 +18,6 @@ describe("Run whisper.node", () => {
        let result = await whisperAsync(whisperParamsMock);

        expect(result.length).toBeGreaterThan(0);
+    }, 10000);
 });
-});
+
--- a/examples/bench.wasm/CMakeLists.txt
+++ b/examples/bench.wasm/CMakeLists.txt
@ -31,9 +31,9 @@ endif()
 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1024MB \
-    -s TOTAL_MEMORY=1024MB \
+    -s PTHREAD_POOL_SIZE_STRICT=0 \
+    -s INITIAL_MEMORY=2000MB \
+    -s TOTAL_MEMORY=2000MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/bench.wasm/index-tmpl.html
+++ b/examples/bench.wasm/index-tmpl.html
@ -35,6 +35,15 @@

            <br><br>

+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
            <hr>

            Select the model you would like to use and click the "Bench" button.<br>
@ -46,9 +55,16 @@
                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en"  onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en"  onclick="loadWhisper('base.en')">base.en (142 MB)</button>
-                <span id="fetch-whisper-progress"></span>
-
+                <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
+                <button id="fetch-whisper-small-en-q5_1"  onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
+                <button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
+                <button id="fetch-whisper-large-q5_0"     onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
+                <span id="fetch-whisper-progress"></span>
            </div>

            <br>
@ -160,6 +176,14 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-small-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
                document.getElementById('whisper-file'         ).style.display = 'none';
                document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
            }
@ -168,11 +192,25 @@
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+                    'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
+                    'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
+                    'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
+                    'large-q5_0':    'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
+                    'small.en': 466,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
+                    'small-en-q5_1':  182,
+                    'medium-en-q5_0': 515,
+                    'large-q5_0':     1030,
                };

                let url     = urls[model];
@ -181,6 +219,15 @@

                document.getElementById('fetch-whisper-tiny-en').style.display  = 'none';
                document.getElementById('fetch-whisper-base-en').style.display  = 'none';
+                document.getElementById('fetch-whisper-small-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
+                document.getElementById('whisper-file'        ).style.display = 'none';
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

                cbProgress = function(p) {
@ -192,6 +239,15 @@
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en');  if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en');  if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-large-q5_0'    ); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('whisper-file'        ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
                };

--- a/examples/command.wasm/emscripten.cpp
+++ b/examples/command.wasm/emscripten.cpp
@ -28,31 +28,6 @@ std::string g_transcribed   = "";

 std::vector<float> g_pcmf32;

-// compute similarity between two strings using Levenshtein distance
-static float similarity(const std::string & s0, const std::string & s1) {
-    const size_t len0 = s0.size() + 1;
-    const size_t len1 = s1.size() + 1;
-
-    std::vector<int> col(len1, 0);
-    std::vector<int> prevCol(len1, 0);
-
-    for (size_t i = 0; i < len1; i++) {
-        prevCol[i] = i;
-    }
-
-    for (size_t i = 0; i < len0; i++) {
-        col[0] = i;
-        for (size_t j = 1; j < len1; j++) {
-            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
-        }
-        col.swap(prevCol);
-    }
-
-    const float dist = prevCol[len1 - 1];
-
-    return 1.0f - (dist / std::max(s0.size(), s1.size()));
-}
-
 void command_set_status(const std::string & status) {
    std::lock_guard<std::mutex> lock(g_mutex);
    g_status = status;
--- a/examples/command.wasm/index-tmpl.html
+++ b/examples/command.wasm/index-tmpl.html
@ -35,6 +35,15 @@

            <br><br>

+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
            <hr>

            Select the model you would like to use, click the "Start" button and follow the instructions.
@ -45,6 +54,10 @@
                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
                <span id="fetch-whisper-progress"></span>

                <!--
@ -162,11 +175,17 @@
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
                };

                let url     = urls[model];
@ -177,6 +196,10 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

                cbProgress = function(p) {
@ -188,6 +211,10 @@
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                };

--- a/examples/command/CMakeLists.txt
+++ b/examples/command/CMakeLists.txt
@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
    # command
    set(TARGET command)
    add_executable(${TARGET} command.cpp)
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -163,31 +163,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
    return result;
 }

-// compute similarity between two strings using Levenshtein distance
-float similarity(const std::string & s0, const std::string & s1) {
-    const size_t len0 = s0.size() + 1;
-    const size_t len1 = s1.size() + 1;
-
-    std::vector<int> col(len1, 0);
-    std::vector<int> prevCol(len1, 0);
-
-    for (size_t i = 0; i < len1; i++) {
-        prevCol[i] = i;
-    }
-
-    for (size_t i = 0; i < len0; i++) {
-        col[0] = i;
-        for (size_t j = 1; j < len1; j++) {
-            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
-        }
-        col.swap(prevCol);
-    }
-
-    const float dist = prevCol[len1 - 1];
-
-    return 1.0f - (dist / std::max(s0.size(), s1.size()));
-}
-
 std::vector<std::string> read_allowed_commands(const std::string & fname) {
    std::vector<std::string> allowed_commands;

--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@ -0,0 +1,235 @@
+#include "common-ggml.h"
+
+#include <regex>
+#include <map>
+
+// Maps the textual quantization names looked up by ggml_parse_ftype() (e.g. "q4_0")
+// to the corresponding ggml_ftype value. Also enumerated by ggml_print_ftypes().
+static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
+    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
+    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+};
+
+// Print every entry of GGML_FTYPE_MAP to fp, one per line, as its name and numeric value.
+void ggml_print_ftypes(FILE * fp) {
+    for (const auto & entry : GGML_FTYPE_MAP) {
+        const std::string &   name  = entry.first;
+        const enum ggml_ftype value = entry.second;
+
+        fprintf(fp, "  type = \"%s\" or %d\n", name.c_str(), value);
+    }
+}
+
+// Convert a string into a ggml_ftype.
+//
+// A string starting with 'q' is looked up by name in GGML_FTYPE_MAP; an unknown name is
+// reported on stderr and yields GGML_FTYPE_UNKNOWN. Any other string is converted with
+// atoi() and cast to the enum without further validation.
+enum ggml_ftype ggml_parse_ftype(const char * str) {
+    // numeric form
+    if (str[0] != 'q') {
+        return (enum ggml_ftype) atoi(str);
+    }
+
+    // symbolic form
+    const auto entry = GGML_FTYPE_MAP.find(str);
+    if (entry != GGML_FTYPE_MAP.end()) {
+        return entry->second;
+    }
+
+    fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
+    return GGML_FTYPE_UNKNOWN;
+}
+
+// Copy the tensor records of a model from finp to fout, quantizing selected tensors.
+//
+// Each record is read as: n_dims, name length, tensor type (int32 each), n_dims int32
+// dimensions, the name bytes and the tensor data. Records are processed until EOF.
+//
+//   finp     - input stream, positioned at the first tensor record
+//   fout     - output stream receiving the (possibly quantized) records
+//   ftype    - target model type; only the MOSTLY_Q4_0/Q4_1/Q5_0/Q5_1/Q8_0 values are accepted
+//   to_quant - regex patterns; a tensor whose full name matches any of them is a candidate
+//   to_skip  - regex patterns; a tensor whose full name matches any of them is never quantized
+//
+// Only 2D tensors stored as F32 or F16 are quantized; everything else is copied through.
+// Progress and size statistics are printed to stdout, errors to stderr.
+//
+// Returns false on an unsupported ftype / tensor type or a malformed tensor header,
+// true once the whole input has been processed.
+bool ggml_common_quantize_0(
+        std::ifstream & finp,
+        std::ofstream & fout,
+        const ggml_ftype ftype,
+        const std::vector<std::string> & to_quant,
+        const std::vector<std::string> & to_skip) {
+
+    ggml_type qtype = GGML_TYPE_F32;
+
+    switch (ftype) {
+        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
+        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_UNKNOWN:
+        case GGML_FTYPE_ALL_F32:
+        case GGML_FTYPE_MOSTLY_F16:
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+                {
+                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
+                    return false;
+                }
+    };
+
+    if (!ggml_is_quantized(qtype)) {
+        fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
+        return false;
+    }
+
+    // compile the name patterns once instead of once per tensor
+    std::vector<std::regex> re_quant;
+    re_quant.reserve(to_quant.size());
+    for (const auto & s : to_quant) {
+        re_quant.emplace_back(s);
+    }
+
+    std::vector<std::regex> re_skip;
+    re_skip.reserve(to_skip.size());
+    for (const auto & s : to_skip) {
+        re_skip.emplace_back(s);
+    }
+
+    size_t total_size_org = 0;
+    size_t total_size_new = 0;
+
+    std::vector<float> work;
+
+    std::vector<uint8_t>     data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
+
+    // histogram accumulated over all quantized tensors (16 buckets)
+    std::vector<int64_t> hist_all(1 << 4, 0);
+
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ttype;
+
+        finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        finp.read(reinterpret_cast<char *>(&length), sizeof(length));
+        finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+
+        if (finp.eof()) {
+            break;
+        }
+
+        // the header comes straight from the file: reject values that would write past
+        // ne[4] below or request a nonsensical name length
+        if (n_dims < 0 || n_dims > 4 || length < 0) {
+            fprintf(stderr, "%s: invalid tensor header (n_dims = %d, name length = %d)\n", __func__, n_dims, length);
+            return false;
+        }
+
+        int32_t nelements = 1;
+        int32_t ne[4] = { 1, 1, 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            nelements *= ne[i];
+        }
+
+        std::string name(length, 0);
+        finp.read (&name[0], length);
+
+        printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype));
+
+        bool quantize = false;
+
+        // check if we should quantize this tensor
+        for (const auto & re : re_quant) {
+            if (std::regex_match(name, re)) {
+                quantize = true;
+                break;
+            }
+        }
+
+        // check if we should skip this tensor (takes precedence over to_quant)
+        for (const auto & re : re_skip) {
+            if (std::regex_match(name, re)) {
+                quantize = false;
+                break;
+            }
+        }
+
+        // quantize only 2D tensors
+        quantize &= (n_dims == 2);
+
+        if (quantize) {
+            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
+                fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+                return false;
+            }
+
+            // the quantization routines take F32 input, so F16 data is widened first
+            if (ttype == GGML_TYPE_F16) {
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
+            // the record written below advertises the quantized type
+            ttype = qtype;
+        } else {
+            // pass-through: type 0 is read as 4 bytes per element, any other type as 2 bytes
+            const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
+        }
+
+        fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fout.write(reinterpret_cast<char *>(&length), sizeof(length));
+        fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+        for (int i = 0; i < n_dims; ++i) {
+            fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+        fout.write(&name[0], length);
+
+        if (quantize) {
+            work.resize(nelements); // for quantization
+
+            size_t cur_size = 0;
+            std::vector<int64_t> hist_cur(1 << 4, 0);
+
+            switch ((ggml_type) ttype) {
+                case GGML_TYPE_Q4_0:
+                    {
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q4_1:
+                    {
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q5_0:
+                    {
+                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q5_1:
+                    {
+                        cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q8_0:
+                    {
+                        cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_I8:
+                case GGML_TYPE_I16:
+                case GGML_TYPE_I32:
+                case GGML_TYPE_Q8_1:
+                case GGML_TYPE_COUNT:
+                    {
+                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+                        return false;
+                    }
+            }
+
+            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
+            total_size_new += cur_size;
+
+            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+            for (int i = 0; i < (int) hist_cur.size(); ++i) {
+                hist_all[i] += hist_cur[i];
+            }
+
+            for (int i = 0; i < (int) hist_cur.size(); ++i) {
+                printf("%5.3f ", hist_cur[i] / (float)nelements);
+            }
+            printf("\n");
+        } else {
+            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+            total_size_new += data_u8.size();
+        }
+
+        // the original size is always accounted as 4 bytes per element, whatever the stored type
+        total_size_org += nelements * sizeof(float);
+    }
+
+    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+    printf("%s: quant size  = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
+
+    {
+        int64_t sum_all = 0;
+        for (int i = 0; i < (int) hist_all.size(); ++i) {
+            sum_all += hist_all[i];
+        }
+
+        printf("%s: hist: ", __func__);
+        for (int i = 0; i < (int) hist_all.size(); ++i) {
+            printf("%5.3f ", hist_all[i] / (float)sum_all);
+        }
+        printf("\n");
+    }
+
+    return true;
+}
--- a/examples/common-ggml.h
+++ b/examples/common-ggml.h
@ -0,0 +1,18 @@
+#pragma once
+
+#include "ggml.h"
+
+#include <fstream>
+#include <vector>
+#include <string>
+
+// Convert a string into a ggml_ftype: a string starting with 'q' is looked up by name
+// (GGML_FTYPE_UNKNOWN if the name is not known), anything else is converted with atoi().
+enum ggml_ftype ggml_parse_ftype(const char * str);
+
+// Print the supported quantization type names and their numeric values to fp.
+void ggml_print_ftypes(FILE * fp = stderr);
+
+// Copy the tensor records from finp to fout, quantizing to the type selected by ftype
+// the 2D tensors whose name matches a regex in to_quant and none in to_skip.
+// Returns false on an unsupported ftype or tensor type.
+bool ggml_common_quantize_0(
+        std::ifstream & finp,
+        std::ofstream & fout,
+        const ggml_ftype ftype,
+        const std::vector<std::string> & to_quant,
+        const std::vector<std::string> & to_skip);
--- a/examples/common.cpp
+++ b/examples/common.cpp
@ -6,12 +6,102 @@
 #include "dr_wav.h"

 #include <cmath>
+#include <fstream>
 #include <regex>

 #ifndef M_PI
 #define M_PI 3.14159265358979323846
 #endif

+// Parse the command-line arguments into params.
+//
+// Returns false when an option that requires a value is the last argument (the problem
+// and the usage text are printed to stderr); otherwise returns true.
+// -h/--help and unknown arguments print the usage text and terminate the process with
+// exit(0). A failure to read the -f/--file prompt is reported on stderr and stops the
+// parsing of the remaining arguments.
+// Note: numeric values are converted with std::stoi/std::stof, which throw on bad input.
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        // Fetch the value that follows the current option and advance i past it.
+        // Returns nullptr (after reporting the problem) when there is no such value,
+        // so that argv[argc] is never dereferenced.
+        const auto next_value = [&]() -> const char * {
+            if (i + 1 >= argc) {
+                fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+                gpt_print_usage(argc, argv, params);
+                return nullptr;
+            }
+            return argv[++i];
+        };
+
+        if (arg == "-s" || arg == "--seed") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.seed = std::stoi(val);
+        } else if (arg == "-t" || arg == "--threads") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.n_threads = std::stoi(val);
+        } else if (arg == "-p" || arg == "--prompt") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.prompt = val;
+        } else if (arg == "-n" || arg == "--n_predict") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.n_predict = std::stoi(val);
+        } else if (arg == "--top_k") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.top_k = std::stoi(val);
+        } else if (arg == "--top_p") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.top_p = std::stof(val);
+        } else if (arg == "--temp") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.temp = std::stof(val);
+        } else if (arg == "-b" || arg == "--batch_size") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.n_batch = std::stoi(val);
+        } else if (arg == "-m" || arg == "--model") {
+            const char * val = next_value();
+            if (!val) return false;
+            params.model = val;
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else if (arg == "-f" || arg == "--file") {
+            // i == argc means there is no file name after the option
+            if (++i >= argc) {
+                fprintf(stderr, "Invalid file param");
+                break;
+            }
+            std::ifstream file(argv[i]);
+            if (!file) {
+                fprintf(stderr, "error: failed to open file '%s'\n", argv[i]);
+                break;
+            }
+            // the file content is appended to whatever prompt has been set so far
+            std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
+            // drop a single trailing newline; an empty prompt has no back() to inspect
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+// Print the command-line help to stderr.
+//
+// argv[0] is used as the program name; the default shown for each option is the value
+// currently stored in params.
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h, --help            show this help message and exit\n");
+    // take the seed default from params like every other option instead of hard-coding it
+    fprintf(stderr, "  -s SEED, --seed SEED  RNG seed (default: %d)\n", params.seed);
+    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fprintf(stderr, "  -p PROMPT, --prompt PROMPT\n");
+    fprintf(stderr, "                        prompt to start generation with (default: random)\n");
+    fprintf(stderr, "  -f FNAME, --file FNAME\n");
+    fprintf(stderr, "                        load prompt from a file\n");
+    fprintf(stderr, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(stderr, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
+    fprintf(stderr, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(stderr, "  --temp N              temperature (default: %.1f)\n", params.temp);
+    fprintf(stderr, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fprintf(stderr, "  -m FNAME, --model FNAME\n");
+    fprintf(stderr, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stderr, "\n");
+}
+
+// Return one of ten fixed prompt openers, selected by the next value drawn from rng.
+std::string gpt_random_prompt(std::mt19937 & rng) {
+    static const char * const k_prompts[10] = {
+        "So",
+        "Once upon a time",
+        "When",
+        "The",
+        "After",
+        "If",
+        "import",
+        "He",
+        "She",
+        "They",
+    };
+
+    // rng() is unsigned, so the remainder is always a valid index in [0, 9]
+    const int idx = rng() % 10;
+
+    return k_prompts[idx];
+}
+
 std::string trim(const std::string & s) {
    std::regex e("^\\s+|\\s+$");
    return std::regex_replace(s, e, "");
@ -27,6 +117,269 @@ std::string replace(const std::string & s, const std::string & from, const std::
    return result;
 }

+// Minimal parser for a flat JSON object of the form { "key": <integer>, ... }.
+//
+// Reads the whole file fname and returns the key -> integer map. The process exits with
+// status 1 if the file cannot be opened. An empty map is returned when the content does
+// not start with '{'. Entries whose value cannot be converted by std::stoi are silently
+// dropped. This is not a general JSON parser: nested objects and arrays are not handled.
+std::map<std::string, int32_t> json_parse(const std::string & fname) {
+    std::map<std::string, int32_t> result;
+
+    // read file into string
+    std::string json;
+    {
+        std::ifstream ifs(fname);
+        if (!ifs) {
+            fprintf(stderr, "Failed to open %s\n", fname.c_str());
+            exit(1);
+        }
+
+        json = std::string((std::istreambuf_iterator<char>(ifs)),
+                (std::istreambuf_iterator<char>()));
+    }
+
+    if (json[0] != '{') {
+        return result;
+    }
+
+    // parse json
+    {
+        bool has_key  = false; // true while the key of the current pair is complete and its quoted value is being read
+        bool in_token = false; // true while inside a double-quoted string
+
+        std::string str_key = "";
+        std::string str_val = "";
+
+        int n = json.size();
+        for (int i = 1; i < n; ++i) {
+            if (!in_token) {
+                // outside a string: an opening quote starts a token, any other character is ignored
+                if (json[i] == ' ') continue;
+                if (json[i] == '"') {
+                    in_token = true;
+                    continue;
+                }
+            } else {
+                if (json[i] == '\\' && i+1 < n) {
+                    // escape sequence: keep the backslash here; the escaped character is
+                    // appended by the common code at the bottom of this branch
+                    if (has_key == false) {
+                        str_key += json[i];
+                    } else {
+                        str_val += json[i];
+                    }
+                    ++i;
+                } else if (json[i] == '"') {
+                    // closing quote of either the key or a quoted value
+                    if (has_key == false) {
+                        has_key = true;
+                        ++i;
+                        while (json[i] == ' ') ++i;
+                        ++i; // :
+                        while (json[i] == ' ') ++i;
+                        if (json[i] != '\"') {
+                            // unquoted value: take everything up to the next ',' or '}'
+                            while (json[i] != ',' && json[i] != '}') {
+                                str_val += json[i++];
+                            }
+                            has_key = false;
+                        } else {
+                            // quoted value: keep scanning, the characters now go to str_val
+                            in_token = true;
+                            continue;
+                        }
+                    } else {
+                        has_key = false;
+                    }
+
+                    // a complete key/value pair is available at this point
+                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
+                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
+                    str_key = ::replace(str_key, "\\\"",    "\""); // \\\"   -> "
+
+                    try {
+                        result[str_key] = std::stoi(str_val);
+                    } catch (...) {
+                        // non-integer values are ignored on purpose
+                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
+
+                    }
+                    str_key = "";
+                    str_val = "";
+                    in_token = false;
+                    continue;
+                }
+                // regular character inside a string: append it to the part being read
+                if (has_key == false) {
+                    str_key += json[i];
+                } else {
+                    str_val += json[i];
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+// Append token to the list of special tokens stored in this vocabulary.
+void gpt_vocab::add_special_token(const std::string & token) {
+    special_tokens.emplace_back(token);
+}
+
+// Convert text into a sequence of vocabulary ids.
+//
+// The text is first split into words with the GPT-2 style regex (special tokens registered
+// in vocab.special_tokens are matched literally and take precedence). Each word is then
+// covered greedily from left to right with the longest substrings present in
+// vocab.token_to_id. A character that is not part of any known token is reported on
+// stderr and skipped.
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+    std::vector<std::string> words;
+
+    // first split the text into words
+    {
+        std::string str = text;
+        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
+
+        // Generate the subpattern from the special_tokens vector if it's not empty
+        if (!vocab.special_tokens.empty()) {
+            // special tokens are literal strings: escape the regex metacharacters so that
+            // e.g. "<|endoftext|>" is not parsed as an alternation of "<", "endoftext", ">"
+            const std::regex escape(R"([\[\]\\\^\$\.\|\?\*\+\(\)\{\}])");
+
+            std::string special_tokens_subpattern;
+            for (const auto & token : vocab.special_tokens) {
+                if (!special_tokens_subpattern.empty()) {
+                    special_tokens_subpattern += "|";
+                }
+                special_tokens_subpattern += std::regex_replace(token, escape, R"(\$&)");
+            }
+
+            // Modify the regex pattern with the generated special tokens subpattern
+            pat = special_tokens_subpattern + "|" + pat;
+        }
+
+        std::regex re(pat);
+        std::smatch m;
+
+        while (std::regex_search(str, m, re)) {
+            for (auto x : m) {
+                words.push_back(x);
+            }
+            str = m.suffix();
+        }
+    }
+
+    // find the longest tokens that form the words:
+    std::vector<gpt_vocab::id> tokens;
+    for (const auto & word : words) {
+        if (word.size() == 0) continue;
+
+        int i = 0;
+        const int n = word.size();
+        while (i < n) {
+            // try the longest candidate starting at i first, then shrink it
+            bool found = false;
+            for (int j = n; j > i; --j) {
+                auto it = vocab.token_to_id.find(word.substr(i, j-i));
+                if (it != vocab.token_to_id.end()) {
+                    tokens.push_back(it->second);
+                    i = j;
+                    found = true;
+                    break;
+                }
+            }
+
+            // Only skip a character when nothing matched at all. Testing "j == i" here
+            // would also fire after a successful match (i was just set to j) and emit
+            // the following character as a separate token.
+            if (!found) {
+                auto sub = word.substr(i, 1);
+                fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
+                ++i;
+            }
+        }
+    }
+
+    return tokens;
+}
+
+// Load the token -> id table from the JSON file fname into vocab and derive the reverse
+// id -> token table from it. Progress is printed to stdout. Always returns true.
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
+    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
+
+    vocab.token_to_id = ::json_parse(fname);
+
+    // build the reverse mapping
+    for (auto it = vocab.token_to_id.begin(); it != vocab.token_to_id.end(); ++it) {
+        vocab.id_to_token[it->second] = it->first;
+    }
+
+    const int n_vocab = (int) vocab.token_to_id.size();
+    printf("%s: vocab size = %d\n", __func__, n_vocab);
+
+    return true;
+}
+
+// Sample the id of the next token from logits.
+//
+//   vocab  - only vocab.id_to_token.size() is used, as the number of logits
+//   logits - one value per token id
+//   top_k  - number of highest-scoring candidates to keep (clamped to [1, number of logits])
+//   top_p  - if < 1, the candidates are further cut to the shortest prefix whose
+//            cumulative probability reaches top_p
+//   temp   - temperature; the logits are divided by it before the softmax
+//   rng    - random number generator used to draw the result
+//
+// Returns the id of the sampled token.
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng) {
+    int n_logits = vocab.id_to_token.size();
+
+    // top_k is used below as an iterator offset and as a container size: a value larger
+    // than n_logits would sort past the end and pad the candidates with bogus entries,
+    // a value <= 0 would leave no candidate to return
+    top_k = std::max(1, std::min(top_k, n_logits));
+
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const double scale = 1.0/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            logits_id.push_back(std::make_pair(logits[i]*scale, i));
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    // subtracting the maximum keeps exp() from overflowing
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        // keep the shortest prefix whose cumulative probability reaches top_p ...
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        // ... and renormalize what is left
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+    //printf("\n");
+    //for (int i = 0; i < (int) probs.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //}
+    //exit(0);
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
+
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin
@ -160,3 +513,27 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float

    return true;
 }
+
+// Similarity of two strings based on their Levenshtein (edit) distance.
+//
+// Returns 1 - distance / max(length of s0, length of s1): 1.0 for identical strings,
+// 0.0 when every position has to be edited. Two empty strings are identical and yield 1.0.
+float similarity(const std::string & s0, const std::string & s1) {
+    const size_t max_len = std::max(s0.size(), s1.size());
+
+    // both strings are empty: avoid the 0/0 division below
+    if (max_len == 0) {
+        return 1.0f;
+    }
+
+    const size_t len1 = s1.size() + 1;
+
+    // two rows of the edit-distance matrix are enough
+    std::vector<int> col(len1, 0);
+    std::vector<int> prevCol(len1, 0);
+
+    // row 0: distance between the empty prefix of s0 and each prefix of s1
+    for (size_t j = 0; j < len1; j++) {
+        prevCol[j] = j;
+    }
+
+    for (size_t i = 1; i <= s0.size(); i++) {
+        col[0] = i;
+        for (size_t j = 1; j < len1; j++) {
+            const int cost = (s0[i - 1] == s1[j - 1]) ? 0 : 1;
+            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + cost);
+        }
+        col.swap(prevCol);
+    }
+
+    // after the final swap the last computed row lives in prevCol
+    const float dist = prevCol[len1 - 1];
+
+    return 1.0f - (dist / max_len);
+}
--- a/examples/common.h
+++ b/examples/common.h
@ -1,10 +1,44 @@
+// Various helper functions and utilities
+
 #pragma once

-// needs to match WHISPER_SAMPLE_RATE
+#include <string>
+#include <map>
+#include <vector>
+#include <random>
+#include <thread>
+
 #define COMMON_SAMPLE_RATE 16000

-#include <vector>
-#include <string>
+//
+// CLI argument parsing
+//
+
+// Command-line options filled in by gpt_params_parse(); the initializers are the defaults.
+struct gpt_params {
+    int32_t seed      = -1; // RNG seed
+    // number of threads to use during computation: at most 4, fewer if the hardware reports fewer
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict = 200; // new tokens to predict
+
+    // sampling parameters
+    int32_t top_k = 40;
+    float   top_p = 0.9f;
+    float   temp  = 0.9f;
+
+    int32_t n_batch = 8; // batch size for prompt processing
+
+    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
+    std::string prompt; // prompt to start generation with (set by -p, extended by -f)
+};
+
+// Fill params from the command-line arguments.
+// -h/--help and unknown arguments print the usage text and exit the process.
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+// Print the usage text to stderr; argv[0] is shown as the program name.
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+// Return one of a fixed set of short prompt openers, selected with rng.
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//

 std::string trim(const std::string & s);

@ -13,6 +47,55 @@ std::string replace(
        const std::string & from,
        const std::string & to);

+// Vocabulary used by the tokenizer and the sampler.
+struct gpt_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    std::map<token, id> token_to_id; // token text -> id
+    std::map<id, token> id_to_token; // id -> token text (filled from token_to_id by gpt_vocab_init)
+    std::vector<std::string> special_tokens; // extra tokens that gpt_tokenize adds to its split pattern
+
+    // append token to special_tokens
+    void add_special_token(const std::string & token);
+};
+
+// poor-man's JSON parsing
+std::map<std::string, int32_t> json_parse(const std::string & fname);
+
+// split text into tokens
+//
+// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+//
+// Regex (Python):
+// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+//
+// Regex (C++):
+// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+//
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+
+// load the tokens from encoder.json
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+
+// sample next token given the logits for each token of the vocabulary
+//
+//   - the logits are scaled by 1/temp
+//   - consider only the top K tokens
+//   - from them, consider only the top tokens with cumulative probability >= P
+//
+// the number of logits read is vocab.id_to_token.size()
+//
+// TODO: not sure if this implementation is correct
+//
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng);
+
+//
+// Audio utils
+//
+
 // Read WAV audio file and store the PCM data into pcmf32
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
@ -38,3 +121,5 @@ bool vad_simple(
        float freq_thold,
        bool  verbose);

+// compute similarity between two strings using Levenshtein distance
+float similarity(const std::string & s0, const std::string & s1);
--- a/examples/helpers.js
+++ b/examples/helpers.js
@ -145,7 +145,15 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
                            var db = event.target.result;
                            var tx = db.transaction(['models'], 'readwrite');
                            var os = tx.objectStore('models');
+
+                            var rq = null;
+                            try {
                                var rq = os.put(data, url);
+                            } catch (e) {
+                                cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
+                                cbCancel();
+                                return;
+                            }

                            rq.onsuccess = function (event) {
                                cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
@ -180,7 +188,6 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {

    rq.onabort = function (event) {
        cbPrint('loadRemote: failed to open IndexedDB: abort');
-
+        cbCancel();
    };
 }
-
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -66,6 +66,7 @@ struct whisper_params {

    bool speed_up       = false;
    bool translate      = false;
+    bool detect_language= false;
    bool diarize        = false;
    bool split_on_word  = false;
    bool no_fallback    = false;
@ -141,6 +142,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-pp"   || arg == "--print-progress") { params.print_progress = true; }
        else if (arg == "-nt"   || arg == "--no-timestamps")  { params.no_timestamps  = true; }
        else if (arg == "-l"    || arg == "--language")       { params.language       = argv[++i]; }
+        else if (arg == "-dl"   || arg == "--detect-language"){ params.detect_language= true; }
        else if (                  arg == "--prompt")         { params.prompt         = argv[++i]; }
        else if (arg == "-m"    || arg == "--model")          { params.model          = argv[++i]; }
        else if (arg == "-f"    || arg == "--file")           { params.fname_inp.emplace_back(argv[++i]); }
@ -189,8 +191,9 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
    fprintf(stderr, "  -pp,       --print-progress    [%-7s] print progress\n",                                 params.print_progress ? "true" : "false");
-    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "false" : "true");
+    fprintf(stderr, "  -nt,       --no-timestamps     [%-7s] do not print timestamps\n",                        params.no_timestamps ? "true" : "false");
    fprintf(stderr, "  -l LANG,   --language LANG     [%-7s] spoken language ('auto' for auto-detect)\n",       params.language.c_str());
+    fprintf(stderr, "  -dl,       --detect-language   [%-7s] exit after automatically detecting language\n",    params.detect_language ? "true" : "false");
    fprintf(stderr, "             --prompt PROMPT     [%-7s] initial prompt\n",                                 params.prompt.c_str());
    fprintf(stderr, "  -m FNAME,  --model FNAME       [%-7s] model path\n",                                     params.model.c_str());
    fprintf(stderr, "  -f FNAME,  --file FNAME        [%-7s] input WAV file path\n",                            "");
@ -352,6 +355,37 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
    return true;
 }

+// Returns a newly allocated copy of `str` in which every double quote and
+// backslash is preceded by a backslash. Returns NULL when `str` is NULL or when
+// the allocation fails. The result comes from calloc(), so the caller releases
+// it with free().
+char *escape_double_quotes_and_backslashes(const char *str) {
+    if (!str) {
+        return NULL;
+    }
+
+    // first pass: one byte for the terminator, one per input character and one
+    // more for every character that gets a backslash in front of it
+    size_t n_bytes = 1;
+    for (const char *p = str; *p != '\0'; ++p) {
+        const bool needs_escape = (*p == '"' || *p == '\\');
+        n_bytes += needs_escape ? 2 : 1;
+    }
+
+    // calloc() zero-fills the buffer, so the terminator is already in place
+    char *escaped = (char *)calloc(n_bytes, 1);
+    if (!escaped) {
+        return NULL;
+    }
+
+    // second pass: copy, inserting the backslash ahead of each escaped character
+    char *out = escaped;
+    for (const char *p = str; *p != '\0'; ++p) {
+        switch (*p) {
+            case '"':
+            case '\\':
+                *out++ = '\\';
+                break;
+            default:
+                break;
+        }
+        *out++ = *p;
+    }
+
+    return escaped;
+}
+
 bool output_csv(struct whisper_context * ctx, const char * fname) {
    std::ofstream fout(fname);
    if (!fout.is_open()) {
@ -367,47 +401,15 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+        char * text_escaped = escape_double_quotes_and_backslashes(text);

        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text    << "\"\n";
+        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text_escaped    << "\"\n";
    }

    return true;
 }

-char *escape_double_quotes(const char *str) {
-    if (str == NULL) {
-        return NULL;
-    }
-
-    size_t escaped_length = strlen(str) + 1;
-
-    for (size_t i = 0; str[i] != '\0'; i++) {
-        if (str[i] == '"') {
-            escaped_length++;
-        }
-    }
-
-    char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
-    if (escaped == NULL) {
-        return NULL;
-    }
-
-    size_t pos = 0;
-    for (size_t i = 0; str[i] != '\0'; i++) {
-        if (str[i] == '"') {
-            escaped[pos++] = '\\';
-            escaped[pos++] = '"';
-        } else {
-            escaped[pos++] = str[i];
-        }
-    }
-
-    // no need to set zero due to calloc() being used prior
-
-    return escaped;
-}
-
 bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
    std::ofstream fout(fname);
    int indent = 0;
@ -451,7 +453,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper

    auto value_s = [&](const char *name, const char *val, bool end = false) {
        start_value(name);
-        char * val_escaped = escape_double_quotes(val);
+        char * val_escaped = escape_double_quotes_and_backslashes(val);
        fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
        free(val_escaped);
    };
@ -497,7 +499,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
                value_i("layer", whisper_model_n_text_layer(ctx), true);
            end_obj();
            value_i("mels", whisper_model_n_mels(ctx));
-            value_i("f16", whisper_model_f16(ctx), true);
+            value_i("ftype", whisper_model_ftype(ctx), true);
        end_obj();
        start_obj("params");
            value_s("model", params.model.c_str());
@ -740,6 +742,9 @@ int main(int argc, char ** argv) {
                    fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
                }
            }
+            if (params.detect_language) {
+                params.language = "auto";
+            }
            fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n",
                    __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
                    params.n_threads, params.n_processors,
@ -762,6 +767,7 @@ int main(int argc, char ** argv) {
            wparams.print_special    = params.print_special;
            wparams.translate        = params.translate;
            wparams.language         = params.language.c_str();
+            wparams.detect_language  = params.detect_language;
            wparams.n_threads        = params.n_threads;
            wparams.n_max_text_ctx   = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
            wparams.offset_ms        = params.offset_t_ms;
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@ -0,0 +1,6 @@
+# quantize: executable built from quantize.cpp
+set(TARGET quantize)
+add_executable(${TARGET} quantize.cpp)
+
+# project-provided module; presumably applies the build options shared by the
+# example targets - TODO confirm
+include(DefaultTargetOptions)
+
+# link against `common` and `whisper`, plus whatever thread library CMake's
+# FindThreads module reported in CMAKE_THREAD_LIBS_INIT (may be empty)
+target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@ -0,0 +1,3 @@
+# quantize
+
+Tool for integer quantization of Whisper `ggml` model files
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@ -0,0 +1,221 @@
+#include "ggml.h"
+
+#include "common.h"
+#include "common-ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <regex>
+
+// Hyperparameters stored in the header of a Whisper `ggml` model file, in the
+// order they are read and written by whisper_model_quantize().
+// The in-class defaults are those of Whisper tiny; every field is overwritten
+// with the value read from the input file.
+struct whisper_hparams {
+    int32_t n_vocab       = 51864;
+    int32_t n_audio_ctx   = 1500;
+    int32_t n_audio_state = 384;
+    int32_t n_audio_head  = 6;
+    int32_t n_audio_layer = 4;
+    int32_t n_text_ctx    = 448;
+    int32_t n_text_state  = 384;
+    int32_t n_text_head   = 6;
+    int32_t n_text_layer  = 4;
+    int32_t n_mels        = 80;
+    // file type; the stored value also carries the quantization version
+    // (version = ftype / GGML_QNT_VERSION_FACTOR)
+    int32_t ftype         = 1;
+};
+
+// Mel filter data as stored in the model file: the two dimensions followed by
+// n_mel * n_fft float coefficients.
+struct whisper_filters {
+    // zero-initialized so that a failed or short read never leaves the
+    // dimensions indeterminate (same in-class default style as whisper_hparams)
+    int32_t n_mel = 0;
+    int32_t n_fft = 0;
+
+    std::vector<float> data;
+};
+
+// Quantize a Whisper `ggml` model file.
+//
+// Copies the header (magic, hyperparameters, mel filters, vocabulary) from
+// fname_inp to fname_out, replacing only the stored ftype, and then hands the
+// rest of the input stream to ggml_common_quantize_0().
+//
+//   fname_inp - path of the model file to read
+//   fname_out - path of the quantized model file to write
+//   ftype     - target ggml file type
+//
+// Returns true on success. On any failure (file cannot be opened, bad magic,
+// truncated or inconsistent header, quantization error, write error) a message
+// is printed to stderr and false is returned.
+bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+    gpt_vocab vocab;
+
+    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
+
+    auto finp = std::ifstream(fname_inp, std::ios::binary);
+    if (!finp) {
+        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    auto fout = std::ofstream(fname_out, std::ios::binary);
+    if (!fout) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
+        return false;
+    }
+
+    // verify magic
+    {
+        // initialized so that a failed read is never compared as an indeterminate value
+        uint32_t magic = 0;
+        finp.read((char *) &magic, sizeof(magic));
+        if (!finp || magic != 0x67676d6c) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+
+        fout.write((char *) &magic, sizeof(magic));
+    }
+
+    whisper_hparams hparams;
+
+    // load hparams
+    {
+        finp.read((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
+        finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
+        finp.read((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
+        finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
+        finp.read((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
+        finp.read((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
+        finp.read((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
+        finp.read((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
+        finp.read((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
+        finp.read((char *) &hparams.ftype,         sizeof(hparams.ftype));
+
+        // a short read leaves the stream in a failed state; without this check the
+        // Whisper tiny defaults would silently be written to the output instead
+        if (!finp) {
+            fprintf(stderr, "%s: invalid model file '%s' (truncated hparams)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+
+        const int32_t qntvr_src =    hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
+        fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
+        fprintf(stderr, "%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
+        fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
+        fprintf(stderr, "%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
+        fprintf(stderr, "%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
+        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
+        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
+        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
+        fprintf(stderr, "%s: ftype (src)   = %d\n", __func__, hparams.ftype);
+        fprintf(stderr, "%s: qntvr (src)   = %d\n", __func__, qntvr_src);
+        fprintf(stderr, "%s: ftype (dst)   = %d\n", __func__, ftype_dst);
+        fprintf(stderr, "%s: qntvr (dst)   = %d\n", __func__, GGML_QNT_VERSION);
+
+        fout.write((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
+        fout.write((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
+        fout.write((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
+        fout.write((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
+        fout.write((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
+        fout.write((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
+        fout.write((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
+        fout.write((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
+        fout.write((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
+        fout.write((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
+        // the only header field that changes: the destination ftype replaces the source one
+        fout.write((char *) &ftype_dst,             sizeof(ftype_dst));
+    }
+
+    // load mel filters
+    {
+        whisper_filters filters;
+
+        finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
+        finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
+
+        // reject a failed read or negative dimensions before they are used as a size
+        if (!finp || filters.n_mel < 0 || filters.n_fft < 0) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad mel filter dimensions)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+
+        fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
+        fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));
+
+        // multiply in size_t so the element count cannot overflow int32_t
+        filters.data.resize((size_t) filters.n_mel * (size_t) filters.n_fft);
+        finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
+        if (!finp) {
+            fprintf(stderr, "%s: invalid model file '%s' (truncated mel filters)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+        fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
+    }
+
+    // load vocab
+    {
+        int32_t n_vocab = 0;
+        finp.read ((char *) &n_vocab, sizeof(n_vocab));
+        if (!finp) {
+            fprintf(stderr, "%s: invalid model file '%s' (truncated vocab)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+        fout.write((char *) &n_vocab, sizeof(n_vocab));
+
+        //if (n_vocab != hparams.n_vocab) {
+        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+        //            __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
+        //    return false;
+        //}
+
+        std::string word;
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len = 0;
+            finp.read ((char *) &len, sizeof(len));
+            if (!finp) {
+                fprintf(stderr, "%s: invalid model file '%s' (truncated vocab)\n", __func__, fname_inp.c_str());
+                return false;
+            }
+            fout.write((char *) &len, sizeof(len));
+
+            word.resize(len);
+            // &word[0] is a writable pointer even where std::string::data() is const-only
+            finp.read (&word[0], len);
+            if (!finp) {
+                fprintf(stderr, "%s: invalid model file '%s' (truncated vocab)\n", __func__, fname_inp.c_str());
+                return false;
+            }
+            fout.write(word.data(), len);
+
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+        }
+    }
+
+    // regexes of tensor names to not be quantized
+    const std::vector<std::string> to_skip = {
+        //"encoder.*",
+        "encoder.conv1.bias",
+        "encoder.conv2.bias",
+        "encoder.positional_embedding",
+        "decoder.positional_embedding",
+    };
+
+    if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
+        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    finp.close();
+    fout.close();
+
+    // a failed write or close leaves the output stream in a failed state
+    if (!fout) {
+        fprintf(stderr, "%s: failed to write '%s'\n", __func__, fname_out.c_str());
+        return false;
+    }
+
+    return true;
+}
+
+// Entry point: quantize <input model> <output model> <type>.
+// Prints the usage line and the known ftypes to stderr and returns 1 unless
+// exactly three arguments are given; returns 1 when quantization fails and 0
+// after a successful run, whose timings are printed to stdout.
+int main(int argc, char ** argv) {
+    if (argc != 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+        ggml_print_ftypes(stderr);
+        return 1;
+    }
+
+    // create and immediately free a context: needed to initialize f16 tables
+    {
+        struct ggml_init_params params = { 0, NULL, false };
+        struct ggml_context * ctx = ggml_init(params);
+        ggml_free(ctx);
+    }
+
+    const std::string fname_inp = argv[1];
+    const std::string fname_out = argv[2];
+    const ggml_ftype  ftype     = ggml_parse_ftype(argv[3]);
+
+    const int64_t t_main_start_us = ggml_time_us();
+
+    // quantize the model and measure how long it takes
+    const int64_t t_quantize_start_us = ggml_time_us();
+    if (!whisper_model_quantize(fname_inp, fname_out, ftype)) {
+        fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
+        return 1;
+    }
+    const int64_t t_quantize_us = ggml_time_us() - t_quantize_start_us;
+
+    // report timing
+    const int64_t t_main_end_us = ggml_time_us();
+    const int64_t t_total_us    = t_main_end_us - t_main_start_us;
+
+    printf("\n");
+    printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
+    printf("%s:    total time = %8.2f ms\n", __func__, t_total_us/1000.0f);
+
+    return 0;
+}
--- a/examples/stream.wasm/index-tmpl.html
+++ b/examples/stream.wasm/index-tmpl.html
@ -35,6 +35,15 @@

            <br><br>

+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a>
+
+            <br><br>
+
            <hr>

            Select the model you would like to use, click the "Start" button and start speaking
@ -45,6 +54,10 @@
                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
                <span id="fetch-whisper-progress"></span>

                <!--
@ -162,11 +175,17 @@
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
                };

                let url     = urls[model];
@ -177,6 +196,10 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

                cbProgress = function(p) {
@ -188,6 +211,10 @@
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                };

--- a/examples/stream/CMakeLists.txt
+++ b/examples/stream/CMakeLists.txt
@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
    # stream
    set(TARGET stream)
    add_executable(${TARGET} stream.cpp)
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -383,6 +383,7 @@ int main(int argc, char ** argv) {
                    }
                }
            }
+            fflush(stdout);
        }
    }

--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
    # talk-llama
    set(TARGET talk-llama)
    #add_executable(${TARGET} talk-llama.cpp llama.cpp)
--- a/examples/talk-llama/README.md
+++ b/examples/talk-llama/README.md
@ -25,6 +25,20 @@ make talk-llama
 - The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
 - The `-ml` argument specifies the LLaMA model that you would like to use. Read the instructions in https://github.com/ggerganov/llama.cpp for information about how to obtain a `ggml` compatible LLaMA model

+## Session
+
+The `talk-llama` tool supports session management to enable more coherent and continuous conversations. By maintaining context from previous interactions, it can better understand and respond to user requests in a more natural way.
+
+To enable session support, use the `--session FILE` command line option when running the program. The `talk-llama` model state will be saved to the specified file after each interaction. If the file does not exist, it will be created. If the file exists, the model state will be loaded from it, allowing you to resume a previous session.
+
+This feature is especially helpful for maintaining context in long conversations or when interacting with the AI assistant across multiple sessions. It ensures that the assistant remembers the previous interactions and can provide more relevant and contextual responses.
+
+Example usage:
+
+```bash
+./talk-llama --session ./my-session-file -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
+```
+
 ## TTS

 For best experience, this example needs a TTS tool to convert the generated text responses to voice.
--- a/examples/talk-llama/llama-util.h
+++ b/examples/talk-llama/llama-util.h
@ -21,12 +21,17 @@
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
    #endif
 #endif

 #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
        #define NOMINMAX
+    #endif
    #include <windows.h>
    #include <io.h>
    #include <stdio.h> // for _fseeki64
@ -41,8 +46,12 @@
    } while (0)

 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
@ -55,7 +64,7 @@ static std::string format(const char * fmt, ...) {
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
-};
+}

 struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
@ -162,7 +171,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
@ -170,17 +179,18 @@ struct llama_mmap {
        flags |= MAP_POPULATE;
 #endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
        if (addr == MAP_FAILED) {
            throw format("mmap failed: %s", strerror(errno));
        }

+        if (prefetch) {
            // Advise the kernel to preload the mapped memory
            if (madvise(addr, file->size, MADV_WILLNEED)) {
                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                        strerror(errno));
            }
        }
+    }

    ~llama_mmap() {
        munmap(addr, size);
@ -188,14 +198,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();
-        CloseHandle(hFile);

        if (hMapping == NULL) {
            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@ -209,6 +218,8 @@ struct llama_mmap {
            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
        }

+        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
            // Advise the kernel to preload the mapped memory
            WIN32_MEMORY_RANGE_ENTRY range;
            range.VirtualAddress = addr;
@ -218,6 +229,10 @@ struct llama_mmap {
                        llama_format_win_err(GetLastError()).c_str());
            }
        }
+        #else
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }

    ~llama_mmap() {
        if (!UnmapViewOfFile(addr)) {
@ -291,8 +306,18 @@ struct llama_mlock {
        if (!mlock(addr, size)) {
            return true;
        } else {
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
-                    size, this->size, std::strerror(errno));
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
            return false;
        }
    }
@ -338,8 +363,8 @@ struct llama_mlock {
            // Hopefully a megabyte is enough overhead:
            size_t increment = size + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += size;
-            max_ws_size += size;
+            min_ws_size += increment;
+            max_ws_size += increment;
            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
@ -380,4 +405,29 @@ struct llama_buffer {
        delete[] addr;
    }
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+// Resizable byte buffer whose storage is obtained from ggml_cuda_host_malloc()
+// and released with ggml_cuda_host_free().
+// NOTE(review): no copy or move operations are declared, so a copied instance
+// would presumably free the same address twice - confirm it is never copied.
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    // Frees the current allocation, if any, and allocates `size` bytes in its
+    // place; the previous contents are not carried over.
+    // NOTE(review): the result of ggml_cuda_host_malloc() is not checked before
+    // `this->size` is updated - confirm how allocation failure is reported.
+    void resize(size_t size) {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        this->size = size;
+    }
+
+    // Releases the allocation, if there is one.
+    ~llama_ctx_buffer() {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+    }
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
 #endif
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@ -19,9 +19,11 @@
 #    define LLAMA_API
 #endif

-#define LLAMA_FILE_VERSION 1
-#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
-#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+#define LLAMA_FILE_VERSION           2
+#define LLAMA_FILE_MAGIC             'ggjt'
+#define LLAMA_FILE_MAGIC_UNVERSIONED 'ggml'
+#define LLAMA_SESSION_MAGIC          'ggsn'
+#define LLAMA_SESSION_VERSION        1

 #ifdef __cplusplus
 extern "C" {
@ -39,18 +41,23 @@ extern "C" {

    typedef struct llama_token_data {
        llama_token id;  // token id
-
+        float logit; // log-odds of the token
        float p;     // probability of the token
-        float plog;  // log probability of the token
-
    } llama_token_data;

+    // Pointer/length pair describing an array of llama_token_data entries.
+    // NOTE(review): `sorted` presumably records whether the entries are already
+    // ordered (llama_sample_softmax sorts candidates by logit) - confirm in llama.cpp
+    typedef struct llama_token_data_array {
+        llama_token_data * data;
+        size_t size;
+        bool sorted;
+    } llama_token_data_array;
+
    typedef void (*llama_progress_callback)(float progress, void *ctx);

    struct llama_context_params {
        int n_ctx;        // text context
        int n_parts;      // -1 for default
-        int seed;    // RNG seed, 0 for random
+        int n_gpu_layers; // number of layers to store in VRAM
+        int seed;         // RNG seed, -1 for random

        bool f16_kv;     // use fp16 for KV cache
        bool logits_all; // the llama_eval() call computes all logits, not just the last one
@ -65,6 +72,20 @@ extern "C" {
        void * progress_callback_user_data;
    };

+    // model file types
+    // the values are explicit and not contiguous: 5 and 6 are left unused because
+    // support for the formats that had them has been removed
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32     = 0,
+        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        // LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // support has been removed
+        // LLAMA_FTYPE_MOSTLY_Q4_3 = 6,  // support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+    };
+
    LLAMA_API struct llama_context_params llama_context_default_params();

    LLAMA_API bool llama_mmap_supported();
@ -82,27 +103,47 @@ extern "C" {

    // TODO: not great API - very likely to change
    // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-                   int   itype);
+      enum llama_ftype   ftype,
+            int          nthread);

-    // Returns the KV cache that will contain the context for the
-    // ongoing prediction with the model.
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-    // Returns the size of the KV cache
-    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads);

    // Returns the number of tokens in the KV cache
-    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
+    LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);

-    // Sets the KV cache containing the current context for the model
-    LLAMA_API void llama_set_kv_cache(
-            struct llama_context * ctx,
-                   const uint8_t * kv_cache,
-                          size_t   n_size,
-                             int   n_token_count);
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+    // Returns the maximum size in bytes of the state (rng, logits, embedding
+    // and kv_cache) - will often be smaller after compacting tokens
+    LLAMA_API size_t llama_get_state_size(const struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+
+    // Save/load session file
+    LLAMA_API bool llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API bool llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);

    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
@ -127,9 +168,9 @@ extern "C" {
                             int   n_max_tokens,
                            bool   add_bos);

-    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
-    LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
-    LLAMA_API int llama_n_embd (struct llama_context * ctx);
+    LLAMA_API int llama_n_vocab(const struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (const struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (const struct llama_context * ctx);

    // Token logits obtained from the last call to llama_eval()
    // The logits for the last token are stored in the last row
@ -143,21 +184,57 @@ extern "C" {
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Token Id -> String. Uses the vocabulary in the provided context
-    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+    LLAMA_API const char * llama_token_to_str(const struct llama_context * ctx, llama_token token);

    // Special tokens
    LLAMA_API llama_token llama_token_bos();
    LLAMA_API llama_token llama_token_eos();
+    LLAMA_API llama_token llama_token_nl();

-    // TODO: improve the last_n_tokens interface ?
-    LLAMA_API llama_token llama_sample_top_p_top_k(
-       struct llama_context * ctx,
-          const llama_token * last_n_tokens_data,
-                        int   last_n_tokens_size,
-                        int   top_k,
-                      float   top_p,
-                      float   temp,
-                      float   repeat_penalty);
+    // Sampling functions
+
+    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep);
+
+    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+
+    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep);
+
+    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep);
+    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+    /// @details Selects the token with the highest probability.
+    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Randomly selects a token from the candidates based on their probabilities.
+    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

    // Performance information
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
@ -170,4 +247,15 @@ extern "C" {
 }
 #endif

+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif
+
 #endif // LLAMA_H
--- a/examples/talk-llama/llama_internal.h
+++ b/examples/talk-llama/llama_internal.h
@ -1,12 +0,0 @@
-// Internal header to be included by llama.cpp and tests/benchmarks only.
-
-#ifndef LLAMA_INTERNAL_H
-#define LLAMA_INTERNAL_H
-
-#include <vector>
-#include <string>
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif // LLAMA_INTERNAL_H
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -52,6 +52,7 @@ struct whisper_params {
    std::string speak       = "./examples/talk-llama/speak.sh";
    std::string prompt      = "";
    std::string fname_out;
+    std::string path_session = "";       // path to file for saving/loading model eval state
 };

 void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
@ -78,6 +79,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
        else if (arg == "--verbose-prompt")                 { params.verbose_prompt = true; }
        else if (arg == "-p"   || arg == "--person")        { params.person        = argv[++i]; }
+        else if (arg == "--session")                        { params.path_session  = argv[++i];}
        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp     = argv[++i]; }
        else if (arg == "-ml"  || arg == "--model-llama")   { params.model_llama   = argv[++i]; }
@ -124,6 +126,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  --n-parts-llama N         [%-7d] num parts in llama model file\n",               params.n_parts_llama);
    fprintf(stderr, "  -s FILE,  --speak TEXT    [%-7s] command for TTS\n",                             params.speak.c_str());
    fprintf(stderr, "  --prompt-file FNAME       [%-7s] file with custom prompt to start dialog\n",     "");
+    fprintf(stderr, "  --session FNAME       file to cache model state in (may be large!) (default: none)\n");
    fprintf(stderr, "  --verbose-prompt          [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
    fprintf(stderr, "\n");
@ -330,10 +333,38 @@ int main(int argc, char ** argv) {

    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);

-    // evaluate the initial prompt
-
+    // init session
+    std::string path_session = params.path_session;
+    std::vector<llama_token> session_tokens;
    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);

+    if (!path_session.empty()) {
+        fprintf(stderr, "%s: attempting to load saved session from %s\n", __func__, path_session.c_str());
+
+        // probe with fopen: a missing file is not an error, it only means there is no session to restore yet
+        FILE * fp = std::fopen(path_session.c_str(), "rb");
+        if (fp != NULL) {
+            std::fclose(fp);
+            // size the buffer to n_ctx tokens; pass size() (not capacity()) as the number of writable elements
+            session_tokens.resize(lparams.n_ctx);
+            size_t n_token_count_out = 0;
+            if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size(), &n_token_count_out)) {
+                fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
+                return 1;
+            }
+            session_tokens.resize(n_token_count_out);
+            for (size_t i = 0; i < session_tokens.size() && i < embd_inp.size(); i++) { // a session longer than the prompt must not write past embd_inp
+                embd_inp[i] = session_tokens[i];
+            }
+
+            fprintf(stderr, "%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size());
+        } else {
+            fprintf(stderr, "%s: session file does not exist, will create\n", __func__);
+        }
+    }
+
+    // evaluate the initial prompt
+
    printf("\n");
    printf("%s : initializing - please wait ...\n", __func__);

@ -348,6 +379,31 @@ int main(int argc, char ** argv) {
        fflush(stdout);
    }

+     // debug message about similarity of saved session, if applicable
+    size_t n_matching_session_tokens = 0;
+    if (session_tokens.size()) {
+        for (llama_token id : session_tokens) {
+            if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) {
+                break;
+            }
+            n_matching_session_tokens++;
+        }
+        if (n_matching_session_tokens >= embd_inp.size()) {
+            fprintf(stderr, "%s: session file has exact match for prompt!\n", __func__);
+        } else if (n_matching_session_tokens < (embd_inp.size() / 2)) {
+            fprintf(stderr, "%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        } else {
+            fprintf(stderr, "%s: session file matches %zu / %zu tokens of prompt\n",
+                __func__, n_matching_session_tokens, embd_inp.size());
+        }
+    }
+
+    // HACK - because session saving incurs a non-negligible delay, for now skip re-saving session
+    // if we loaded a session with at least 75% similarity. It's currently just used to speed up the
+    // initial prompt so it doesn't need to be an exact match.
+    bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < (embd_inp.size() * 3 / 4);
+
    printf("%s : done! start speaking in the microphone\n", __func__);
    printf("\n");
    printf("%s%s", params.person.c_str(), chat_symb.c_str());
@ -363,6 +419,7 @@ int main(int argc, char ** argv) {

    int n_past = n_keep;
    int n_prev = 64; // TODO arg
+    int n_session_consumed = !path_session.empty() && session_tokens.size() > 0 ? session_tokens.size() : 0;

    std::vector<llama_token> embd;

@ -439,6 +496,11 @@ int main(int argc, char ** argv) {

                embd = ::llama_tokenize(ctx_llama, text_heard, false);

+                // Append the new input tokens to the session_tokens vector
+                if (!path_session.empty()) {
+                    session_tokens.insert(session_tokens.end(), tokens.begin(), tokens.end());
+                }
+
                // text inference
                bool done = false;
                std::string text_to_speak;
@ -450,7 +512,8 @@ int main(int argc, char ** argv) {

                            // insert n_left/2 tokens at the start of embd from last_n_tokens
                            embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
-
+                            // stop saving session if we run out of context
+                            path_session = "";
                            //printf("\n---\n");
                            //printf("resetting: '");
                            //for (int i = 0; i < (int) embd.size(); i++) {
@ -460,16 +523,44 @@ int main(int argc, char ** argv) {
                            //printf("\n---\n");
                        }

+                        // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past)
+                        // REVIEW
+                        if (n_session_consumed < (int) session_tokens.size()) {
+                            size_t i = 0;
+                            for ( ; i < embd.size(); i++) {
+                                if (embd[i] != session_tokens[n_session_consumed]) {
+                                    session_tokens.resize(n_session_consumed);
+                                    break;
+                                }
+
+                                n_past++;
+                                n_session_consumed++;
+
+                                if (n_session_consumed >= (int) session_tokens.size()) {
+                                    i++;
+                                    break;
+                                }
+                            }
+                            if (i > 0) {
+                                embd.erase(embd.begin(), embd.begin() + i);
+                            }
+                        }
+
+                        if (embd.size() > 0 && !path_session.empty()) {
+                            session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
+                            n_session_consumed = session_tokens.size();
+                        }
+
                        if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
                            fprintf(stderr, "%s : failed to eval\n", __func__);
                            return 1;
                        }
                    }

-                    //printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());

                    embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
                    n_past += embd.size();
+
                    embd.clear();

                    if (done) break;
@ -483,15 +574,46 @@ int main(int argc, char ** argv) {

                        const int repeat_last_n    = 256;

+                        if (!path_session.empty() && need_to_save_session) {
+                            need_to_save_session = false;
+                            llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
+                        }
+
                        llama_token id = 0;

                        {
                            auto logits = llama_get_logits(ctx_llama);
+                            auto n_vocab = llama_n_vocab(ctx_llama);
+
                            logits[llama_token_eos()] = 0;

-                            id = llama_sample_top_p_top_k(ctx_llama,
+                            std::vector<llama_token_data> candidates;
+                            candidates.reserve(n_vocab);
+                            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+                            }
+
+                            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+                            // apply repeat penalty, but remember the newline logit so newlines are not penalized
+                            const float nl_logit = logits[llama_token_nl()];
+
+                            llama_sample_repetition_penalty(ctx_llama, &candidates_p,
                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
-                                    repeat_last_n, top_k, top_p, temp, repeat_penalty);
+                                    repeat_last_n, repeat_penalty);
+                            // restore into candidates (what the samplers read), not logits; candidates[i].id == i as built above
+                            candidates[llama_token_nl()].logit = nl_logit; // NOTE(review): assumes the penalty does not reorder candidates - confirm
+
+                            if (temp <= 0) {
+                                // Greedy sampling
+                                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
+                            } else {
+                                // Temperature sampling
+                                llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
+                                llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
+                                llama_sample_temperature(ctx_llama, &candidates_p, temp);
+                                id = llama_sample_token(ctx_llama, &candidates_p);
+                            }
                        }

                        if (id != llama_token_eos()) {
@ -516,6 +638,7 @@ int main(int argc, char ** argv) {
                                done = true;
                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
                                fflush(stdout);
+                                need_to_save_session = true;
                                break;
                            }
                        }
--- a/examples/talk.wasm/CMakeLists.txt
+++ b/examples/talk.wasm/CMakeLists.txt
@ -13,6 +13,7 @@ include(DefaultTargetOptions)

 target_link_libraries(${TARGET} PRIVATE
    whisper
+    common
    )

 unset(EXTRA_FLAGS)
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -1,4 +1,6 @@
 #include "ggml.h"
+#include "common-ggml.h"
+
 #include "gpt-2.h"

 #include <cmath>
@ -14,150 +16,6 @@

 /////////////////////// GPT-2 BEGIN /////////////////////////

-//
-// Vocab utils
-//
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.size() == 0) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const gpt_vocab & vocab,
-        const float * logits,
-        int    top_k,
-        double top_p,
-        double temp,
-        std::mt19937 & rng) {
-    int n_logits = vocab.id_to_token.size();
-
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    for (int i = 0; i < n_logits; i++) {
-        logits_id.push_back(std::make_pair(logits[i], i));
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    // normalize
-    {
-        double sum = 0.0f;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            sum += logits_id[i].first;
-        }
-
-        sum = 1.0/sum;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            logits_id[i].first *= sum;
-        }
-    }
-
-    if (top_p < 1.0f) {
-        {
-            double cumsum = 0.0f;
-            for (int i = 0; i < top_k; i++) {
-                cumsum += logits_id[i].first;
-                if (cumsum >= top_p) {
-                    logits_id.resize(i+1);
-                    break;
-                }
-            }
-        }
-
-        // normalize again
-        {
-            double sum = 0.0f;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                sum += logits_id[i].first;
-            }
-
-            sum = 1.0/sum;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                logits_id[i].first *= sum;
-            }
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int)logits_id.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
-    //}
-    //exit(0);
-
-    // sample from the obtained distribution
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    for (int i = 0; i < (int) logits_id.size(); i++) {
-        probs.push_back(logits_id[i].first);
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-}
-
 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
    int32_t n_vocab = 50257;
@ -165,7 +23,7 @@ struct gpt2_hparams {
    int32_t n_embd  = 768;
    int32_t n_head  = 12;
    int32_t n_layer = 12;
-    int32_t f16     = 1;
+    int32_t ftype   = 1;
 };

 struct gpt2_layer {
@ -187,7 +45,7 @@ struct gpt2_layer {
    struct ggml_tensor * c_mlp_fc_w;
    struct ggml_tensor * c_mlp_fc_b;

-    struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
+    struct ggml_tensor * c_mlp_proj_w;
    struct ggml_tensor * c_mlp_proj_b;
 };

@ -200,6 +58,7 @@ struct gpt2_model {

    struct ggml_tensor * wte;     // position embedding
    struct ggml_tensor * wpe;     //    token embedding
+    struct ggml_tensor * lm_head; // language model head

    std::vector<gpt2_layer> layers;

@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
    }

    // load vocab
@ -275,9 +134,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        }
    }

-    // for the big tensors, we have the option to store the data in 16-bit floats
+    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
-    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return false;
+    }

    auto & ctx = model.ctx;

@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

-        ctx_size += n_vocab*n_embd*ggml_type_size(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
+        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head

-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
+        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_size(GGML_TYPE_F32));   // c_attn_proj_b
+        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
+        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
+        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
+        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = NULL;
+        struct ggml_init_params params = {
+            .mem_size   = ctx_size,
+            .mem_buffer = NULL,
+            .no_alloc   = false,
+        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -352,6 +219,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);

        // map by name
        model.tensors["model/ln_f/g"] = model.ln_f_g;
@ -359,6 +227,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

        model.tensors["model/wte"]     = model.wte;
        model.tensors["model/wpe"]     = model.wpe;
+        model.tensors["model/lm_head"] = model.lm_head;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];
@ -369,16 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.c_attn_attn_w      = ggml_new_tensor_2d(ctx, wtype,         3*n_embd, n_embd);
+            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.c_mlp_fc_w         = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
+            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

-            layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
+            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            // map by name
@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;

-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w_trans;
+            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
        }
    }
@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
    {
        size_t total_size = 0;

+        bool has_lm_head = false;
+
        while (true) {
            int32_t n_dims;
            int32_t length;
-            int32_t ftype;
+            int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (fin.eof()) {
                break;
@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
                return false;
            }

-            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+            // for debugging
+            if (0) {
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            }

-            if (nelements*bpe != ggml_nbytes(tensor)) {
+            const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                return false;
@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

-            //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            // GPT-2 models share the WTE tensor as the LM head
+            if (name == "model/wte" && has_lm_head == false) {
+                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
+            }
+
+            if (name == "model/lm_head") {
+                has_lm_head = true;
+            }
+
            total_size += ggml_nbytes(tensor);
        }

@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 //   - n_threads: number of threads to use
 //   - n_past:    the context size so far
 //   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted probabilities of the next token
+//   - embd_w:    the predicted logits for the next token
 //
 bool gpt2_eval(
        const gpt2_model & model,
@ -512,12 +396,12 @@ bool gpt2_eval(
    const int n_head  = hparams.n_head;
    const int n_vocab = hparams.n_vocab;

-    static size_t buf_size = 640u*1024*1024;
+    static size_t buf_size = 512u*1024*1024;
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
        buf_size = buf_size_new;
@ -528,12 +412,13 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };

    struct ggml_context * ctx0 = ggml_init(params);
-
    struct ggml_cgraph gf = {};
    gf.n_threads = n_threads;

@ -578,7 +463,7 @@ bool gpt2_eval(
        // [2304, N]
        {
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
+                    model.layers[il].c_attn_attn_w,
                    cur);

            cur = ggml_add(ctx0,
@ -654,11 +539,13 @@ bool gpt2_eval(
            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            // [n_past + N, 64, 12]
            struct ggml_tensor * V_trans =
+                ggml_cpy(ctx0,
                        ggml_permute(ctx0,
                            ggml_reshape_3d(ctx0,
                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
+                            1, 2, 0, 3),
+                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

            // KQV = transpose(V) * KQ_soft_max
            // [64, N, 12]
@ -685,7 +572,7 @@ bool gpt2_eval(
        // [768, N]
        {
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                    model.layers[il].c_attn_proj_w,
                    cur);

            cur = ggml_add(ctx0,
@ -722,7 +609,7 @@ bool gpt2_eval(
            // cur = fc_w*cur + fc_b
            // [3072, N]
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+                    model.layers[il].c_mlp_fc_w,
                    cur);

            cur = ggml_add(ctx0,
@ -742,7 +629,7 @@ bool gpt2_eval(
            // cur = proj_w*cur + proj_b
            // [768, N]
            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w_trans,
+                    model.layers[il].c_mlp_proj_w,
                    cur);

            cur = ggml_add(ctx0,
@ -769,12 +656,12 @@ bool gpt2_eval(
    }

    // inpL = WTE * inpL
-    // [ 768, 50257] - model.wte
+    // [ 768, 50257] - model.lm_head
    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.wte, inpL);
+    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

    // logits -> probs
-    inpL = ggml_soft_max(ctx0, inpL);
+    //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
@ -788,7 +675,7 @@ bool gpt2_eval(
    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

-    // return result for just the last token
+    // return result just for the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

@ -825,7 +712,7 @@ Me too.
    int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());

    // sampling parameters
-    int32_t top_k = 40;
+    int32_t top_k = 5;
    float   top_p = 0.9f;
    float   temp  = 1.0f;
 };
@ -833,14 +720,14 @@ Me too.
 struct gpt2_context * gpt2_init(const char * path_model) {
    gpt2_context * ctx = new gpt2_context;

-    ctx->rng = std::mt19937(time(NULL));
+    ctx->rng = std::mt19937(time(nullptr));

    // load the model
    {
        const int64_t t_start_us = ggml_time_us();

        if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
            delete ctx;
            return nullptr;
        }
@ -885,9 +772,9 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)

    std::string result;

-    for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
+    for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
        // predict
-        if (embd.size() > 0) {
+        if (!embd.empty()) {
            if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
                printf("gpt-2: failed to generate text\n");
                return "";
@ -914,10 +801,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
        result += ctx->vocab.id_to_token[embd[0]];

        // end of text token
-        if (embd.back() == 50256 ||
-            ctx->vocab.id_to_token[embd.back()] == "." ||
-            ctx->vocab.id_to_token[embd.back()] == "!" ||
-            ctx->vocab.id_to_token[embd.back()] == "?") {
+        if (embd.back() == 50256) {
            break;
        }
    }
--- a/examples/talk.wasm/gpt-2.h
+++ b/examples/talk.wasm/gpt-2.h
@ -2,18 +2,12 @@

 // TODO: Change to C-style API and move to ./examples for easy reuse.

+#include "common.h"
+
 #include <vector>
 #include <map>
 #include <string>

-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-};
-
 struct gpt2_context;

 struct gpt2_context * gpt2_init(const char * path_model);
--- a/examples/talk.wasm/index-tmpl.html
+++ b/examples/talk.wasm/index-tmpl.html
@ -44,6 +44,15 @@

            <br><br>

+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
            <hr>

            Select the models you would like to use and click the "Start" button to begin the conversation
@ -54,6 +63,10 @@
                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
                <span id="fetch-whisper-progress"></span>

                <!--
@ -266,11 +279,17 @@
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
                };

                let url     = urls[model];
@ -281,6 +300,10 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

                cbProgress = function(p) {
@ -292,6 +315,10 @@
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                };

--- a/examples/talk/CMakeLists.txt
+++ b/examples/talk/CMakeLists.txt
@ -1,16 +1,8 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
    # talk
    set(TARGET talk)
-    #add_executable(${TARGET} talk.cpp gpt-2.cpp)
-    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
-    #target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
-    # TODO: this is temporary
-    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
+    add_executable(${TARGET} talk.cpp gpt-2.cpp)
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})

    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -1,4 +1,6 @@
 #include "ggml.h"
+#include "common-ggml.h"
+
 #include "gpt-2.h"

 #include <cmath>
@ -14,150 +16,6 @@

 /////////////////////// GPT-2 BEGIN /////////////////////////

-//
-// Vocab utils
-//
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.empty()) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const gpt_vocab & vocab,
-        const float * logits,
-        int    top_k,
-        double top_p,
-        double /*temp*/,
-        std::mt19937 & rng) {
-    int n_logits = vocab.id_to_token.size();
-
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    for (int i = 0; i < n_logits; i++) {
-        logits_id.emplace_back(logits[i], i);
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    // normalize
-    {
-        double sum = 0.0f;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            sum += logits_id[i].first;
-        }
-
-        sum = 1.0/sum;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            logits_id[i].first *= sum;
-        }
-    }
-
-    if (top_p < 1.0f) {
-        {
-            double cumsum = 0.0f;
-            for (int i = 0; i < top_k; i++) {
-                cumsum += logits_id[i].first;
-                if (cumsum >= top_p) {
-                    logits_id.resize(i+1);
-                    break;
-                }
-            }
-        }
-
-        // normalize again
-        {
-            double sum = 0.0f;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                sum += logits_id[i].first;
-            }
-
-            sum = 1.0/sum;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                logits_id[i].first *= sum;
-            }
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int) logits_id.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
-    //}
-    //exit(0);
-
-    // sample from the obtained distribution
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    for (int i = 0; i < (int) logits_id.size(); i++) {
-        probs.push_back(logits_id[i].first);
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-}
-
 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
    int32_t n_vocab = 50257;
@ -165,7 +23,7 @@ struct gpt2_hparams {
    int32_t n_embd  = 768;
    int32_t n_head  = 12;
    int32_t n_layer = 12;
-    int32_t f16     = 1;
+    int32_t ftype   = 1;
 };

 struct gpt2_layer {
@ -187,7 +45,7 @@ struct gpt2_layer {
    struct ggml_tensor * c_mlp_fc_w;
    struct ggml_tensor * c_mlp_fc_b;

-    struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
+    struct ggml_tensor * c_mlp_proj_w;
    struct ggml_tensor * c_mlp_proj_b;
 };

@ -200,6 +58,7 @@ struct gpt2_model {

    struct ggml_tensor * wte;     //    token embedding
    struct ggml_tensor * wpe;     // position embedding
+    struct ggml_tensor * lm_head; // language model head

    std::vector<gpt2_layer> layers;

@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
    }

    // load vocab
@ -268,16 +127,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            fin.read((char *) &len, sizeof(len));

            word.resize(len);
-            fin.read((char *) &word[0], len);
+            fin.read((char *) word.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }

-    // for the big tensors, we have the option to store the data in 16-bit floats
+    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
-    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return false;
+    }

    auto & ctx = model.ctx;

@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

-        ctx_size += n_vocab*n_embd*ggml_type_size(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
+        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head

-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
+        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_size(GGML_TYPE_F32));   // c_attn_proj_b
+        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
+        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
+        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
+        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = nullptr;
+        struct ggml_init_params params = {
+            .mem_size   = ctx_size,
+            .mem_buffer = NULL,
+            .no_alloc   = false,
+        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -352,6 +219,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);

        // map by name
        model.tensors["model/ln_f/g"] = model.ln_f_g;
@ -359,6 +227,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

        model.tensors["model/wte"]     = model.wte;
        model.tensors["model/wpe"]     = model.wpe;
+        model.tensors["model/lm_head"] = model.lm_head;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];
@ -369,16 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.c_attn_attn_w      = ggml_new_tensor_2d(ctx, wtype,         3*n_embd, n_embd);
+            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.c_mlp_fc_w         = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
+            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

-            layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
+            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            // map by name
@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;

-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w_trans;
+            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
        }
    }
@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
    {
        size_t total_size = 0;

+        bool has_lm_head = false;
+
        while (true) {
            int32_t n_dims;
            int32_t length;
-            int32_t ftype;
+            int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (fin.eof()) {
                break;
@ -448,7 +319,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            std::string name(length, 0);
            fin.read(&name[0], length);

-            if (model.tensors.find(name) == model.tensors.end()) {
+            if (model.tensors.find(name.data()) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                return false;
            }
@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
                return false;
            }

-            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+            // for debugging
+            if (0) {
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            }

-            if (nelements*bpe != ggml_nbytes(tensor)) {
+            const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                return false;
@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

-            //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            // GPT-2 models share the WTE tensor as the LM head
+            if (name == "model/wte" && has_lm_head == false) {
+                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
+            }
+
+            if (name == "model/lm_head") {
+                has_lm_head = true;
+            }
+
            total_size += ggml_nbytes(tensor);
        }

@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 //   - n_threads: number of threads to use
 //   - n_past:    the context size so far
 //   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted probabilities of the next token
+//   - embd_w:    the predicted logits for the next token
 //
 bool gpt2_eval(
        const gpt2_model & model,
@ -512,12 +396,12 @@ bool gpt2_eval(
    const int n_head  = hparams.n_head;
    const int n_vocab = hparams.n_vocab;

-    static size_t buf_size = 5640ull*1024*1024;
+    static size_t buf_size = 512u*1024*1024;
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
        buf_size = buf_size_new;
@ -528,12 +412,13 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };

    struct ggml_context * ctx0 = ggml_init(params);
-
    struct ggml_cgraph gf = {};
    gf.n_threads = n_threads;

@ -578,7 +463,7 @@ bool gpt2_eval(
        // [2304, N]
        {
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
+                    model.layers[il].c_attn_attn_w,
                    cur);

            cur = ggml_add(ctx0,
@ -654,11 +539,13 @@ bool gpt2_eval(
            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            // [n_past + N, 64, 12]
            struct ggml_tensor * V_trans =
+                ggml_cpy(ctx0,
                        ggml_permute(ctx0,
                            ggml_reshape_3d(ctx0,
                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
+                            1, 2, 0, 3),
+                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

            // KQV = transpose(V) * KQ_soft_max
            // [64, N, 12]
@ -685,7 +572,7 @@ bool gpt2_eval(
        // [768, N]
        {
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                    model.layers[il].c_attn_proj_w,
                    cur);

            cur = ggml_add(ctx0,
@ -722,7 +609,7 @@ bool gpt2_eval(
            // cur = fc_w*cur + fc_b
            // [3072, N]
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+                    model.layers[il].c_mlp_fc_w,
                    cur);

            cur = ggml_add(ctx0,
@ -742,7 +629,7 @@ bool gpt2_eval(
            // cur = proj_w*cur + proj_b
            // [768, N]
            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w_trans,
+                    model.layers[il].c_mlp_proj_w,
                    cur);

            cur = ggml_add(ctx0,
@ -769,12 +656,12 @@ bool gpt2_eval(
    }

    // inpL = WTE * inpL
-    // [ 768, 50257] - model.wte
+    // [ 768, 50257] - model.lm_head
    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.wte, inpL);
+    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

    // logits -> probs
-    inpL = ggml_soft_max(ctx0, inpL);
+    //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
@ -788,7 +675,7 @@ bool gpt2_eval(
    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

-    // return result for just the last token
+    // return result just for the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

--- a/examples/talk/gpt-2.h
+++ b/examples/talk/gpt-2.h
@ -2,18 +2,12 @@

 // TODO: Change to C-style API and move to ./examples for easy reuse.

+#include "common.h"
+
 #include <vector>
 #include <map>
 #include <string>

-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-};
-
 struct gpt2_context;

 struct gpt2_context * gpt2_init(const char * path_model);
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -9,4 +9,6 @@ To use:
 5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
 [^1]: I recommend the tiny or base models for running on an Android device.

+(PS: Do not move this Android project folder to another location on its own, because it depends on files from the rest of the repository.)
+
 <img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -14,15 +14,24 @@ https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-b
 ```java
 git clone https://github.com/ggerganov/whisper.cpp
 open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/
+
+// If you don't want to convert a Core ML model, you can skip this step by creating a dummy model
+mkdir models/ggml-base.en-encoder.mlmodelc
 ```

 Make sure to build the project in `Release`:

 <img width="947" alt="image" src="https://user-images.githubusercontent.com/1991296/197382607-9e1e6d1b-79fa-496f-9d16-b71dc1535701.png">

-Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Phases.
+Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag for `ggml.c` in Build Phases.
 This can significantly improve the performance of the transcription:

 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">

+If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK` compiler flags for `whisper.cpp` in Build Phases:
+
+<img width="1072" alt="image" src="https://github.com/ggerganov/whisper.cpp/assets/3001525/103e8f57-6eb6-490d-a60c-f6cf6c319324">
+
+Then follow the [`Core ML support` section of the README](../../README.md#core-ml-support) to convert the model.
+
 In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -14,9 +14,13 @@
 		18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; };
 		18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; };
 		18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; };
-		18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; };
+		18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK"; }; };
 		18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE"; }; };
 		18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
+		7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
+		7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
+		7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; };
+		7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */ = {isa = PBXBuildFile; fileRef = 7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */; };
 /* End PBXBuildFile section */

 /* Begin PBXFileReference section */
@ -37,6 +41,13 @@
 		18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml.c; sourceTree = "<group>"; };
 		18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml.h; sourceTree = "<group>"; };
 		18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = "ggml-base.en.bin"; path = "../../../models/ggml-base.en.bin"; sourceTree = "<group>"; };
+		7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
+		7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
+		7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
+		7FE342482A0C3FA20015A058 /* whisper-decoder-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-decoder-impl.h"; sourceTree = "<group>"; };
+		7FE342492A0C3FA20015A058 /* whisper-encoder-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder-impl.h"; sourceTree = "<group>"; };
+		7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-decoder-impl.m"; sourceTree = "<group>"; };
+		7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = "ggml-base.en-encoder.mlmodelc"; path = "../../../models/ggml-base.en-encoder.mlmodelc"; sourceTree = "<group>"; };
 /* End PBXFileReference section */

 /* Begin PBXFrameworksBuildPhase section */
@ -69,6 +80,8 @@
 		18627C7829052BDF00BD2A04 /* whisper.objc */ = {
 			isa = PBXGroup;
 			children = (
+				7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */,
+				7FE342442A0C3FA20015A058 /* coreml */,
 				18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */,
 				18627C9729052C6600BD2A04 /* ggml.h */,
 				18627C9529052C5800BD2A04 /* ggml.c */,
@ -89,6 +102,20 @@
 			path = whisper.objc;
 			sourceTree = "<group>";
 		};
+		7FE342442A0C3FA20015A058 /* coreml */ = {
+			isa = PBXGroup;
+			children = (
+				7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */,
+				7FE342462A0C3FA20015A058 /* whisper-encoder.h */,
+				7FE342472A0C3FA20015A058 /* whisper-encoder.mm */,
+				7FE342482A0C3FA20015A058 /* whisper-decoder-impl.h */,
+				7FE342492A0C3FA20015A058 /* whisper-encoder-impl.h */,
+				7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */,
+			);
+			name = coreml;
+			path = ../../../coreml;
+			sourceTree = "<group>";
+		};
 /* End PBXGroup section */

 /* Begin PBXNativeTarget section */
@ -147,6 +174,7 @@
 			buildActionMask = 2147483647;
 			files = (
 				18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */,
+				7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */,
 				18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */,
 				18627C8429052BDF00BD2A04 /* Main.storyboard in Resources */,
 				18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */,
@ -161,11 +189,14 @@
 			buildActionMask = 2147483647;
 			files = (
 				18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
+				7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
 				18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
 				18627C9629052C5800BD2A04 /* ggml.c in Sources */,
 				18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
+				7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
 				18627C8C29052BE000BD2A04 /* main.m in Sources */,
 				18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
+				7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
 			);
 			runOnlyForDeploymentPostprocessing = 0;
 		};
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -31,9 +31,9 @@ endif()
 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1500MB \
-    -s TOTAL_MEMORY=1500MB \
+    -s PTHREAD_POOL_SIZE_STRICT=0 \
+    -s INITIAL_MEMORY=2000MB \
+    -s TOTAL_MEMORY=2000MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/whisper.wasm/README.md
+++ b/examples/whisper.wasm/README.md
@ -38,5 +38,5 @@ make -j

 # copy the produced page to your HTTP path
 cp bin/whisper.wasm/*    /path/to/html/
-cp bin/libwhisper.worker.js /path/to/html/
+cp bin/libmain.worker.js /path/to/html/
 ```
--- a/examples/whisper.wasm/emscripten.cpp
+++ b/examples/whisper.wasm/emscripten.cpp
@ -10,6 +10,12 @@ std::thread g_worker;

 std::vector<struct whisper_context *> g_contexts(4, nullptr);

+static inline int mpow2(int n) { // largest power of two <= n; returns 1 (never 0) when n < 2
+    int p = 1;
+    while (p <= n/2) p *= 2; // p <= n/2 guarantees 2*p <= n, so the doubling cannot overflow int
+    return p;
+}
+
 EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        if (g_worker.joinable()) {
@ -43,7 +49,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        }
    }));

-    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
+    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) {
        if (g_worker.joinable()) {
            g_worker.join();
        }
@ -66,7 +72,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        params.print_special    = false;
        params.translate        = translate;
        params.language         = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
-        params.n_threads        = std::min(8, (int) std::thread::hardware_concurrency());
+        params.n_threads        = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
        params.offset_ms        = 0;

        std::vector<float> pcmf32;
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@ -40,21 +40,42 @@

            Note that the computation is quite heavy and may take a few seconds to complete.<br>
            The transcription results will be displayed in the text area below.<br><br>
-            <b>Important: your browser must support WASM SIMD instructions for this to work.</b>
+            <b>Important:</b>
+                <ul>
+                    <li>your browser must support WASM SIMD instructions for this to work</li>
+                    <li>Firefox cannot load files larger than 256 MB - use Chrome instead</li>
+                </ul>

-            <br><br><hr>
+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <hr>

            <div id="model">
-                Whisper model: <span id="model-whisper-status"></span>
+                Whisper models: <span id="model-whisper-status"></span><br><br>
                <button id="fetch-whisper-tiny-en"  onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-tiny"     onclick="loadWhisper('tiny')">tiny (75 MB)</button>
                <button id="fetch-whisper-base-en"  onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <button id="fetch-whisper-base"     onclick="loadWhisper('base')">base (142 MB)</button>
                <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
                <button id="fetch-whisper-small"    onclick="loadWhisper('small')">small (466 MB)</button>
-                <span id="fetch-whisper-progress"></span>
-
                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-tiny-q5_1"      onclick="loadWhisper('tiny-q5_1')">tiny (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
+                <button id="fetch-whisper-base-q5_1"      onclick="loadWhisper('base-q5_1')">base (Q5_1, 57 MB)</button>
+                <button id="fetch-whisper-small-en-q5_1"  onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
+                <button id="fetch-whisper-small-q5_1"     onclick="loadWhisper('small-q5_1')">small (Q5_1, 182 MB)</button><br>
+                <button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
+                <button id="fetch-whisper-medium-q5_0"    onclick="loadWhisper('medium-q5_0')">medium (Q5_0, 515 MB)</button>
+                <button id="fetch-whisper-large-q5_0"     onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
+                <span id="fetch-whisper-progress"></span>
            </div>

            <br>
@ -161,6 +182,12 @@
                            <option value="yi">Yiddish</option>
                        </select>
                    </td>
+                    <!-- Slider to select number of threads between 1 and 16 -->
+                    <td>
+                        Threads:
+                        <input type="range" id="threads" name="threads" min="1" max="16" value="8" onchange="changeThreads(this.value)" />
+                        <span id="threads-value">8</span>
+                    </td>
                    <td>
                        <button onclick="onProcess(false);">Transcribe</button>
                    </td>
@ -263,11 +290,13 @@

                Module.FS_createDataFile("/", fname, buf, true, true);

-                model_whisper = fname;
+                //model_whisper = fname;

                document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';

                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
+
+                document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper;
            }

            function loadFile(event, fname) {
@ -292,6 +321,17 @@
                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-tiny-q5_1'     ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-q5_1'     ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-q5_1'    ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-medium-q5_0'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
                document.getElementById('whisper-file'          ).style.display = 'none';
                document.getElementById('model-whisper-status'  ).innerHTML = 'loaded model: ' + file.name;
            }
@ -304,6 +344,16 @@
                    'base':     'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
                    'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
                    'small':    'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'tiny-q5_1':     'https://whisper.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
+                    'base-q5_1':     'https://whisper.ggerganov.com/ggml-model-whisper-base-q5_1.bin',
+                    'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
+                    'small-q5_1':    'https://whisper.ggerganov.com/ggml-model-whisper-small-q5_1.bin',
+                    'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
+                    'medium-q5_0':   'https://whisper.ggerganov.com/ggml-model-whisper-medium-q5_0.bin',
+                    'large-q5_0':    'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
                };

                let sizes = {
@ -313,6 +363,16 @@
                    'base':     142,
                    'small.en': 466,
                    'small':    466,
+
+                    'tiny-en-q5_1':   31,
+                    'tiny-q5_1':      31,
+                    'base-en-q5_1':   57,
+                    'base-q5_1':      57,
+                    'small-en-q5_1':  182,
+                    'small-q5_1':     182,
+                    'medium-en-q5_0': 515,
+                    'medium-q5_0':    515,
+                    'large-q5_0':     1030,
                };

                let url     = urls[model];
@ -327,6 +387,17 @@
                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-tiny-q5_1'     ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-q5_1'     ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-q5_1'    ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-medium-q5_0'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
                document.getElementById('whisper-file'        ).style.display = 'none';
                document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model;

@ -337,12 +408,24 @@

                cbCancel = function() {
                    var el;
+
                    el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-tiny'    ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base'    ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-small'   ); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-tiny-q5_1'     ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-q5_1'     ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-q5_1'    ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-medium-q5_0'   ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-large-q5_0'    ); if (el) el.style.display = 'inline-block';
+
                    el = document.getElementById('whisper-file'        ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
                };
@ -354,7 +437,8 @@
            // audio file
            //

-            const kMaxAudio_s = 120;
+            const kMaxAudio_s = 30*60;
+            const kMaxRecording_s = 2*60;
            const kSampleRate = 16000;

            window.AudioContext = window.AudioContext || window.webkitAudioContext;
@ -423,7 +507,7 @@
                doRecording = false;
            }

-            // record up to kMaxAudio_s seconds of audio from the microphone
+            // record up to kMaxRecording_s seconds of audio from the microphone
            // check if doRecording is false every 1000 ms and stop recording if so
            // update progress information
            function startRecording() {
@ -479,9 +563,9 @@
                                        printTextarea('js: audio recorded, size: ' + audio.length);

                                        // truncate to first 30 seconds
-                                        if (audio.length > kMaxAudio_s*kSampleRate) {
-                                            audio = audio.slice(0, kMaxAudio_s*kSampleRate);
-                                            printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds');
+                                        if (audio.length > kMaxRecording_s*kSampleRate) {
+                                            audio = audio.slice(0, kMaxRecording_s*kSampleRate);
+                                            printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds');
                                        }
                                        setAudio(audio);
                                    });
@ -509,24 +593,31 @@
                        });
                    }

-                    document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%';
-                    document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%';
+                    document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%';
+                    document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%';
                }, 1000);

                printTextarea('js: recording ...');

                setTimeout(function() {
                    if (doRecording) {
-                        printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds');
+                        printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds');
                        stopRecording();
                    }
-                }, kMaxAudio_s*1000);
+                }, kMaxRecording_s*1000);
            }

            //
            // transcribe
            //

+            var nthreads = 8;
+
+            function changeThreads(value) { // slider callback: store and display the selected thread count
+                nthreads = parseInt(value, 10); // input values arrive as strings; keep nthreads numeric for Module.full_default
+                document.getElementById('threads-value').innerHTML = nthreads;
+            }
+
            function onProcess(translate) {
                if (!instance) {
                    instance = Module.init('whisper.bin');
@ -553,7 +644,7 @@
                    printTextarea('');

                    setTimeout(function() {
-                        var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate);
+                        var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate);
                        console.log('js: full_default returned: ' + ret);
                        if (ret) {
                            printTextarea("js: whisper returned: " + ret);
--- a/extra/bench-all.sh
+++ b/extra/bench-all.sh
@ -2,7 +2,7 @@

 # Helper script to run the bench tool on all models and print the results in share-able format

-printf "Usage: ./bench.sh [n_threads]\n"
+printf "Usage: ./bench.sh [n_threads] [encoder-only]\n"

 if [ -z "$1" ]; then
    n_threads=4
@ -10,13 +10,27 @@ else
    n_threads=$1
 fi

-models=( "tiny" "base" "small" "medium" "large" )
+encoder_only=0
+if [ -z "$2" ]; then
+    encoder_only=0
+else
+    encoder_only=$2
+fi

+models=(                                               \
+      "tiny"   "tiny-q5_0"   "tiny-q5_1"   "tiny-q8_0" \
+      "base"   "base-q5_0"   "base-q5_1"   "base-q8_0" \
+     "small"  "small-q5_0"  "small-q5_1"  "small-q8_0" \
+    "medium" "medium-q5_0" "medium-q5_1" "medium-q8_0" \
+     "large"  "large-q5_0"  "large-q5_1"  "large-q8_0" \
+)
+
+if [ "$encoder_only" -eq 0 ]; then
    printf "\n"
-printf "Running memcpy benchmark with 1 thread\n"
+    printf "Running memcpy benchmark\n"
    printf "\n"

-./bench -w 1 -t 1 2>&1
+    ./bench -w 1 -t $n_threads 2>&1

    printf "\n"
    printf "Running ggml_mul_mat benchmark with $n_threads threads\n"
@ -28,6 +42,7 @@ printf "\n"
    printf "Running benchmark for all models\n"
    printf "This can take a while!\n"
    printf "\n"
+fi

 printf "| CPU | OS | Config | Model | Th | Load | Enc. | Commit |\n"
 printf "| --- | -- | ------ | ----- | -- | ---- | ---- | ------ |\n"
@ -39,6 +54,7 @@ for model in "${models[@]}"; do
    # actual run
    # store stderr output in a variable in order to parse it later
    output=$(./bench -m ./models/ggml-$model.bin -t $n_threads 2>&1)
+    ret=$?

    # parse the output:
    load_time=$(echo "$output" | grep "load time" | awk '{print $5}')
@ -70,5 +86,7 @@ for model in "${models[@]}"; do

    commit=$(git rev-parse --short HEAD)

+    if [ $ret -eq 0 ]; then
        printf "| <todo> | <todo> | $config | $model | $n_threads | $load_time | $encode_time | $commit |\n"
+    fi
 done
--- a/extra/quantize-all.sh
+++ b/extra/quantize-all.sh
@ -0,0 +1,45 @@
+#!/bin/bash
+
+printf "Usage: $0 <upload>"
+
+if [ $# -ne 1 ]; then
+    printf "\nError: Invalid number of arguments\n"
+    exit 1
+fi
+
+qtype0="q5_0"
+qtype1="q5_1"
+upload="$1"
+
+cd `dirname $0`
+cd ../
+
+./quantize ./models/ggml-tiny.en.bin   ./models/ggml-tiny.en-${qtype1}.bin ${qtype1}
+./quantize ./models/ggml-tiny.bin      ./models/ggml-tiny-${qtype1}.bin    ${qtype1}
+
+./quantize ./models/ggml-base.en.bin   ./models/ggml-base.en-${qtype1}.bin ${qtype1}
+./quantize ./models/ggml-base.bin      ./models/ggml-base-${qtype1}.bin    ${qtype1}
+
+./quantize ./models/ggml-small.en.bin  ./models/ggml-small.en-${qtype1}.bin ${qtype1}
+./quantize ./models/ggml-small.bin     ./models/ggml-small-${qtype1}.bin    ${qtype1}
+
+./quantize ./models/ggml-medium.en.bin ./models/ggml-medium.en-${qtype0}.bin ${qtype0}
+./quantize ./models/ggml-medium.bin    ./models/ggml-medium-${qtype0}.bin    ${qtype0}
+
+./quantize ./models/ggml-large.bin     ./models/ggml-large-${qtype0}.bin ${qtype0}
+
+if [ "$upload" == "1" ]; then
+    scp ./models/ggml-tiny.en-${qtype1}.bin   root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny.en-${qtype1}.bin
+    scp ./models/ggml-tiny-${qtype1}.bin      root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny-${qtype1}.bin
+
+    scp ./models/ggml-base.en-${qtype1}.bin   root@linode0:/mnt/Data/ggml/ggml-model-whisper-base.en-${qtype1}.bin
+    scp ./models/ggml-base-${qtype1}.bin      root@linode0:/mnt/Data/ggml/ggml-model-whisper-base-${qtype1}.bin
+
+    scp ./models/ggml-small.en-${qtype1}.bin  root@linode0:/mnt/Data/ggml/ggml-model-whisper-small.en-${qtype1}.bin
+    scp ./models/ggml-small-${qtype1}.bin     root@linode0:/mnt/Data/ggml/ggml-model-whisper-small-${qtype1}.bin
+
+    scp ./models/ggml-medium.en-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium.en-${qtype0}.bin
+    scp ./models/ggml-medium-${qtype0}.bin    root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium-${qtype0}.bin
+
+    scp ./models/ggml-large-${qtype0}.bin     root@linode0:/mnt/Data/ggml/ggml-model-whisper-large-${qtype0}.bin
+fi
--- a/extra/sync-ggml.sh
+++ b/extra/sync-ggml.sh
@ -0,0 +1,18 @@
+#!/bin/bash
+
+cp -rpv ../ggml/src/ggml.c               ./ggml.c
+cp -rpv ../ggml/src/ggml-cuda.h          ./ggml-cuda.h
+cp -rpv ../ggml/src/ggml-cuda.cu         ./ggml-cuda.cu
+cp -rpv ../ggml/src/ggml-opencl.h        ./ggml-opencl.h
+cp -rpv ../ggml/src/ggml-opencl.c        ./ggml-opencl.c
+cp -rpv ../ggml/include/ggml/ggml.h      ./ggml.h
+cp -rpv ../ggml/examples/common.h        ./examples/common.h
+cp -rpv ../ggml/examples/common.cpp      ./examples/common.cpp
+cp -rpv ../ggml/examples/common-ggml.h   ./examples/common-ggml.h
+cp -rpv ../ggml/examples/common-ggml.cpp ./examples/common-ggml.cpp
+
+cp -rpv ../ggml/examples/whisper/whisper.h    ./whisper.h
+cp -rpv ../ggml/examples/whisper/whisper.cpp  ./whisper.cpp
+cp -rpv ../ggml/examples/whisper/main.cpp     ./examples/main/main.cpp
+cp -rpv ../ggml/examples/whisper/quantize.cpp ./examples/quantize/quantize.cpp
+
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@ -0,0 +1,898 @@
+#include <cstddef>
+#include <cstdint>
+#include <stdint.h>
+#include <stdio.h>
+#include <atomic>
+
+#include <cuda_runtime.h>
+#include <cublas_v2.h>
+#include <cuda_fp16.h>
+
+#include "ggml-cuda.h"
+#include "ggml.h"
+
+static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
+
+// Evaluate a CUDA runtime call once; if it did not return cudaSuccess, print the
+// numeric error, the source location and cudaGetErrorString() to stderr and
+// terminate the process with exit(1).
+#define CUDA_CHECK(err)                                                                 \
+    do {                                                                                \
+        cudaError_t err_ = (err);                                                       \
+        if (err_ != cudaSuccess) {                                                      \
+            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__,   \
+                cudaGetErrorString(err_));                                              \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+// Evaluate a cuBLAS call once; if it did not return CUBLAS_STATUS_SUCCESS, print
+// the numeric status and the source location to stderr and terminate with exit(1).
+#define CUBLAS_CHECK(err)                                                               \
+    do {                                                                                \
+        cublasStatus_t err_ = (err);                                                    \
+        if (err_ != CUBLAS_STATUS_SUCCESS) {                                            \
+            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);    \
+            exit(1);                                                                    \
+        }                                                                               \
+    } while (0)
+
+// device callback: produce two dequantized values (v0, v1) from block ib at quant index iqs
+typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, float & v0, float & v1);
+// host launcher: convert k values of x to fp32 in y on the given stream
+typedef void (*to_fp32_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
+// host launcher: dst = vx * y for an nrows x ncols matrix vx (dequantized on the fly) and a vector y
+typedef void (*dequantize_mul_mat_vec_cuda_t)(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream);
+
+// QK = number of values after dequantization
+// QR = QK / number of values before dequantization
+
+// q4_0: 32 values as 4-bit quants (two per byte) with one float scale; value = (q - 8) * d
+#define QK4_0 32
+#define QR4_0 2
+typedef struct {
+    float   d;              // delta
+    uint8_t qs[QK4_0 / 2];  // nibbles / quants
+} block_q4_0;
+static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
+
+// q4_1: 32 values as 4-bit quants with a float scale and a float offset; value = q * d + m
+#define QK4_1 32
+#define QR4_1 2
+typedef struct {
+    float   d;              // delta
+    float   m;              // min
+    uint8_t qs[QK4_1 / 2];  // nibbles / quants
+} block_q4_1;
+static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
+
+// q5_0: 32 values as 5-bit quants (low 4 bits in qs, fifth bit in qh) with a half scale; value = (q - 16) * d
+#define QK5_0 32
+#define QR5_0 2
+typedef struct {
+    half d;                 // delta
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_0 / 2];  // nibbles / quants
+} block_q5_0;
+static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
+
+// q5_1: 32 values as 5-bit quants with a half scale and a half offset; value = q * d + m
+#define QK5_1 32
+#define QR5_1 2
+typedef struct {
+    half d;                 // delta
+    half m;                 // min
+    uint8_t qh[4];          // 5-th bit of quants
+    uint8_t qs[QK5_1 / 2];  // nibbles / quants
+} block_q5_1;
+static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
+
+// q8_0: 32 signed 8-bit quants with one float scale; value = q * d
+#define QK8_0 32
+#define QR8_0 1
+typedef struct {
+    float   d;              // delta
+    int8_t  qs[QK8_0];      // quants
+} block_q8_0;
+static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
+
+#define CUDA_DMMV_BLOCK_SIZE 32
+
+// Dequantize one packed byte of a q4_0 block.
+//   vx  - array of block_q4_0
+//   ib  - block index, iqs - byte index into that block's qs[]
+//   v0  - value decoded from the low nibble, v1 - value decoded from the high nibble
+static __device__ void dequantize_q4_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q4_0 & blk = ((const block_q4_0 *) vx)[ib];
+
+    const float scale  = blk.d;
+    const int   packed = blk.qs[iqs];
+
+    // both nibbles are stored with a bias of 8
+    v0 = ((packed & 0xF) - 8)*scale;
+    v1 = ((packed >>  4) - 8)*scale;
+}
+
+// Dequantize one packed byte of a q4_1 block.
+//   vx  - array of block_q4_1
+//   ib  - block index, iqs - byte index into that block's qs[]
+//   v0  - value decoded from the low nibble, v1 - value decoded from the high nibble
+static __device__ void dequantize_q4_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q4_1 & blk = ((const block_q4_1 *) vx)[ib];
+
+    const float scale  = blk.d;
+    const float offset = blk.m;
+    const int   packed = blk.qs[iqs];
+
+    v0 = (packed & 0xF)*scale + offset;
+    v1 = (packed >>  4)*scale + offset;
+}
+
+// Dequantize one packed byte of a q5_0 block.
+//   vx  - array of block_q5_0
+//   ib  - block index, iqs - byte index into that block's qs[]
+//   v0  - value built from the low nibble,  v1 - value built from the high nibble;
+//         the fifth bit of each comes from the block's qh bit field
+static __device__ void dequantize_q5_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_0 & blk = ((const block_q5_0 *) vx)[ib];
+
+    const float scale = blk.d;
+
+    // qh is a byte array, so assemble the 32-bit field with memcpy
+    uint32_t high_bits;
+    memcpy(&high_bits, blk.qh, sizeof(high_bits));
+
+    // fifth bit of each quant, already positioned at bit 4
+    const int bit4_lo = ((high_bits >> iqs) & 1) << 4;   // bit iqs      of qh
+    const int bit4_hi = (high_bits >> (iqs + 12)) & 0x10; // bit iqs + 16 of qh
+
+    const int packed = blk.qs[iqs];
+
+    // 5-bit quants are stored with a bias of 16
+    const int q0 = ((packed & 0xf) | bit4_lo) - 16;
+    const int q1 = ((packed >>  4) | bit4_hi) - 16;
+
+    v0 = q0*scale;
+    v1 = q1*scale;
+}
+
+// Dequantize one packed byte of a q5_1 block.
+//   vx  - array of block_q5_1
+//   ib  - block index, iqs - byte index into that block's qs[]
+//   v0  - value built from the low nibble,  v1 - value built from the high nibble;
+//         the fifth bit of each comes from the block's qh bit field
+static __device__ void dequantize_q5_1(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q5_1 & blk = ((const block_q5_1 *) vx)[ib];
+
+    const float scale  = blk.d;
+    const float offset = blk.m;
+
+    // qh is a byte array, so assemble the 32-bit field with memcpy
+    uint32_t high_bits;
+    memcpy(&high_bits, blk.qh, sizeof(high_bits));
+
+    // fifth bit of each quant, already positioned at bit 4
+    const int bit4_lo = ((high_bits >> iqs) & 1) << 4;   // bit iqs      of qh
+    const int bit4_hi = (high_bits >> (iqs + 12)) & 0x10; // bit iqs + 16 of qh
+
+    const int packed = blk.qs[iqs];
+
+    const int q0 = (packed & 0xf) | bit4_lo;
+    const int q1 = (packed >>  4) | bit4_hi;
+
+    v0 = q0*scale + offset;
+    v1 = q1*scale + offset;
+}
+
+// Dequantize two neighbouring quants of a q8_0 block.
+//   vx  - array of block_q8_0
+//   ib  - block index, iqs - index of the first of the two quants in qs[]
+//   v0  - value of qs[iqs], v1 - value of qs[iqs + 1]
+static __device__ void dequantize_q8_0(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const block_q8_0 & blk = ((const block_q8_0 *) vx)[ib];
+
+    const float scale = blk.d;
+
+    v0 = blk.qs[iqs + 0]*scale;
+    v1 = blk.qs[iqs + 1]*scale;
+}
+
+// Load the two consecutive fp16 values x[ib] and x[ib + 1] and widen them to float.
+// The signature matches dequantize_kernel_t; iqs is part of that signature and is
+// not used here.
+// NOTE(review): ib is used directly as an element index, so the caller has to pass a
+// per-element index rather than a multi-value block index - confirm at the call site.
+static __device__ void convert_f16(const void * vx, const int ib, const int iqs, float & v0, float & v1){
+    const half * x = (const half *) vx;
+
+    v0 = __half2float(x[ib + 0]);
+    v1 = __half2float(x[ib + 1]);
+}
+
+// Expand the q4_0 block selected by blockIdx.x into QK4_0 consecutive floats of y.
+static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
+    const int n_vals = QK4_0;
+
+    const int ib = blockIdx.x;
+    const block_q4_0 & blk = ((const block_q4_0 *) vx)[ib];
+    float * out = y + ib*n_vals;
+
+    const float scale = blk.d;
+
+    for (int j = 0; j < n_vals/2; ++j) {
+        const int packed = blk.qs[j];
+
+        // low nibbles fill the first half of the output, high nibbles the second half
+        out[j           ] = ((packed & 0xf) - 8)*scale;
+        out[j + n_vals/2] = ((packed >>  4) - 8)*scale;
+    }
+}
+
+// Expand the q4_1 block selected by blockIdx.x into QK4_1 consecutive floats of y.
+static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
+    const int n_vals = QK4_1;
+
+    const int ib = blockIdx.x;
+    const block_q4_1 & blk = ((const block_q4_1 *) vx)[ib];
+    float * out = y + ib*n_vals;
+
+    const float scale  = blk.d;
+    const float offset = blk.m;
+
+    for (int j = 0; j < n_vals/2; ++j) {
+        const int packed = blk.qs[j];
+
+        // low nibbles fill the first half of the output, high nibbles the second half
+        out[j           ] = (packed & 0xf)*scale + offset;
+        out[j + n_vals/2] = (packed >>  4)*scale + offset;
+    }
+}
+
+// Expand the q5_0 block selected by blockIdx.x into QK5_0 consecutive floats of y.
+static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
+    const int n_vals = QK5_0;
+
+    const int ib = blockIdx.x;
+    const block_q5_0 & blk = ((const block_q5_0 *) vx)[ib];
+    float * out = y + ib*n_vals;
+
+    const float scale = blk.d;
+
+    // qh is a byte array, so assemble the 32-bit field with memcpy
+    uint32_t high_bits;
+    memcpy(&high_bits, blk.qh, sizeof(high_bits));
+
+    for (int j = 0; j < n_vals/2; ++j) {
+        // fifth bit of each quant, already positioned at bit 4
+        const int bit4_lo = ((high_bits >> j) & 1) << 4;   // bit j      of qh
+        const int bit4_hi = (high_bits >> (j + 12)) & 0x10; // bit j + 16 of qh
+
+        const int packed = blk.qs[j];
+
+        // 5-bit quants are stored with a bias of 16
+        const int q0 = ((packed & 0xf) | bit4_lo) - 16;
+        const int q1 = ((packed >>  4) | bit4_hi) - 16;
+
+        out[j           ] = q0*scale;
+        out[j + n_vals/2] = q1*scale;
+    }
+}
+
+// Expand the q5_1 block selected by blockIdx.x into QK5_1 consecutive floats of y.
+static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
+    const int n_vals = QK5_1;
+
+    const int ib = blockIdx.x;
+    const block_q5_1 & blk = ((const block_q5_1 *) vx)[ib];
+    float * out = y + ib*n_vals;
+
+    const float scale  = blk.d;
+    const float offset = blk.m;
+
+    // qh is a byte array, so assemble the 32-bit field with memcpy
+    uint32_t high_bits;
+    memcpy(&high_bits, blk.qh, sizeof(high_bits));
+
+    for (int j = 0; j < n_vals/2; ++j) {
+        // fifth bit of each quant, already positioned at bit 4
+        const int bit4_lo = ((high_bits >> j) & 1) << 4;   // bit j      of qh
+        const int bit4_hi = (high_bits >> (j + 12)) & 0x10; // bit j + 16 of qh
+
+        const int packed = blk.qs[j];
+
+        const int q0 = (packed & 0xf) | bit4_lo;
+        const int q1 = (packed >>  4) | bit4_hi;
+
+        out[j           ] = q0*scale + offset;
+        out[j + n_vals/2] = q1*scale + offset;
+    }
+}
+
+// Expand the q8_0 block selected by blockIdx.x into QK8_0 consecutive floats of y.
+static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
+    const int n_vals = QK8_0;
+
+    const int ib = blockIdx.x;
+    const block_q8_0 & blk = ((const block_q8_0 *) vx)[ib];
+    float * out = y + ib*n_vals;
+
+    const float scale = blk.d;
+
+    for (int j = 0; j < n_vals; ++j) {
+        out[j] = blk.qs[j]*scale;
+    }
+}
+
+// Fused dequantization + matrix-vector product: dst[row] = dot(row of vx, y).
+//
+// Template parameters:
+//   block_size        - number of threads per CUDA block (also the size of the reduction buffer)
+//   qk, qr            - values per quantized block and values per stored quant (see QK*/QR* above)
+//   dequantize_kernel - device callback that yields two values of the row per call
+// Launch layout: blockIdx.x selects the row, the threads of the block share the row's columns.
+// Parameters:
+//   vx    - the matrix in its stored (quantized) form
+//   y     - input vector with ncols floats
+//   dst   - output vector, one float per row
+//   ncols - number of columns of the matrix
+template <int block_size, int qk, int qr, dequantize_kernel_t dequantize_kernel>
+static __global__ void dequantize_mul_mat_vec(const void * vx, const float * y, float * dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int tid = threadIdx.x;
+
+    // distance in y between the two values returned by dequantize_kernel
+    const int y_offset = qr == 1 ? 1 : qk/2;
+
+    __shared__ float tmp[block_size]; // separate sum for each thread
+    tmp[tid] = 0;
+
+    // every thread consumes 2 values per iteration, so one iteration covers 2*block_size columns
+    for (int i = 0; i < ncols/block_size; i += 2) {
+        const int col = i*block_size + 2*tid;
+        if (col >= ncols) {
+            // ncols is an odd multiple of block_size: in the last iteration the upper half of
+            // the threads has no columns left and must not read past the end of the row and of y
+            break;
+        }
+        const int ib = (row*ncols + col)/qk; // block index
+        const int iqs = (col%qk)/qr; // quant index
+        const int iybs = col - col%qk; // y block start index
+
+        // dequantize
+        float v0, v1;
+        dequantize_kernel(vx, ib, iqs, v0, v1);
+
+        // matrix multiplication
+        tmp[tid] += v0 * y[iybs + iqs + 0];
+        tmp[tid] += v1 * y[iybs + iqs + y_offset];
+    }
+
+    // sum up partial sums and write back result
+    // (all threads reach this barrier: the early exit above only leaves the loop)
+    __syncthreads();
+    for (int s=block_size/2; s>0; s>>=1) {
+        if (tid < s) {
+            tmp[tid] += tmp[tid + s];
+        }
+        __syncthreads();
+    }
+    if (tid == 0) {
+        dst[row] = tmp[0];
+    }
+}
+
+// Host launchers for the dequantize_block_* kernels.
+// Each converts k quantized values from vx to fp32 in y on the given stream by
+// launching one single-thread CUDA block per quantized block (k / QK* blocks).
+
+static void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_0<<<k / QK4_0, 1, 0, stream>>>(vx, y);
+}
+
+static void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q4_1<<<k / QK4_1, 1, 0, stream>>>(vx, y);
+}
+
+static void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q5_0<<<k / QK5_0, 1, 0, stream>>>(vx, y);
+}
+
+static void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q5_1<<<k / QK5_1, 1, 0, stream>>>(vx, y);
+}
+
+static void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
+    dequantize_block_q8_0<<<k / QK8_0, 1, 0, stream>>>(vx, y);
+}
+
+// Host launchers for dequantize_mul_mat_vec, one per quantization type.
+// Each computes dst = vx * y for an nrows x ncols quantized matrix vx on the given
+// stream, using one CUDA block of CUDA_DMMV_BLOCK_SIZE threads per matrix row.
+// ncols has to be a multiple of CUDA_DMMV_BLOCK_SIZE.
+
+static void dequantize_mul_mat_vec_q4_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+
+    const dim3 grid(nrows, 1, 1);
+    const dim3 block(CUDA_DMMV_BLOCK_SIZE, 1, 1);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_0, QR4_0, dequantize_q4_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q4_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+
+    const dim3 grid(nrows, 1, 1);
+    const dim3 block(CUDA_DMMV_BLOCK_SIZE, 1, 1);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK4_1, QR4_1, dequantize_q4_1><<<grid, block, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+
+    const dim3 grid(nrows, 1, 1);
+    const dim3 block(CUDA_DMMV_BLOCK_SIZE, 1, 1);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_0, QR5_0, dequantize_q5_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q5_1_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+
+    const dim3 grid(nrows, 1, 1);
+    const dim3 block(CUDA_DMMV_BLOCK_SIZE, 1, 1);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK5_1, QR5_1, dequantize_q5_1><<<grid, block, 0, stream>>>(vx, y, dst, ncols);
+}
+
+static void dequantize_mul_mat_vec_q8_0_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+
+    const dim3 grid(nrows, 1, 1);
+    const dim3 block(CUDA_DMMV_BLOCK_SIZE, 1, 1);
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, QK8_0, QR8_0, dequantize_q8_0><<<grid, block, 0, stream>>>(vx, y, dst, ncols);
+}
+
+// TODO: optimize
+// Widen the single fp16 element selected by blockIdx.x to fp32.
+static __global__ void convert_fp16_to_fp32(const void * vx, float * y) {
+    const half * src = (const half *) vx;
+    const int idx = blockIdx.x;
+
+    y[idx] = __half2float(src[idx]);
+}
+
+// Convert k fp16 values of x to fp32 in y on the given stream,
+// launching one single-thread CUDA block per element.
+static void convert_fp16_to_fp32_cuda(const void * x, float * y, int k, cudaStream_t stream) {
+    convert_fp16_to_fp32<<<k, 1, 0, stream>>>(x, y);
+}
+
+// Matrix-vector product dst = vx * y for an nrows x ncols fp16 matrix vx, computed on
+// the given stream with one CUDA block of CUDA_DMMV_BLOCK_SIZE threads per row.
+// ncols has to be a multiple of CUDA_DMMV_BLOCK_SIZE.
+static void convert_mul_mat_vec_f16_cuda(const void * vx, const float * y, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
+    GGML_ASSERT(ncols % CUDA_DMMV_BLOCK_SIZE == 0);
+    // fp16 data has no block structure, so qk must be 1: the kernel then hands
+    // convert_f16 the element index row*ncols + col, which is what it indexes with.
+    // (With qk = 32 the index was divided by 32 and the wrong elements were read.)
+    dequantize_mul_mat_vec<CUDA_DMMV_BLOCK_SIZE, 1, 1, convert_f16>
+        <<<nrows, CUDA_DMMV_BLOCK_SIZE, 0, stream>>>(vx, y, dst, ncols);
+}
+
+// Look up the host launcher that converts data of the given type to fp32.
+// Returns nullptr for types without a conversion kernel.
+static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) {
+    to_fp32_cuda_t launcher = nullptr;
+
+    switch (type) {
+        case GGML_TYPE_Q4_0: launcher = dequantize_row_q4_0_cuda;  break;
+        case GGML_TYPE_Q4_1: launcher = dequantize_row_q4_1_cuda;  break;
+        case GGML_TYPE_Q5_0: launcher = dequantize_row_q5_0_cuda;  break;
+        case GGML_TYPE_Q5_1: launcher = dequantize_row_q5_1_cuda;  break;
+        case GGML_TYPE_Q8_0: launcher = dequantize_row_q8_0_cuda;  break;
+        case GGML_TYPE_F16:  launcher = convert_fp16_to_fp32_cuda; break;
+        default:                                                   break;
+    }
+
+    return launcher;
+}
+
+// Look up the host launcher of the fused dequantize + matrix-vector kernel for the
+// given type. Returns nullptr for types without such a kernel.
+static dequantize_mul_mat_vec_cuda_t ggml_get_dequantize_mul_mat_vec_cuda(ggml_type type) {
+    dequantize_mul_mat_vec_cuda_t launcher = nullptr;
+
+    switch (type) {
+        case GGML_TYPE_Q4_0: launcher = dequantize_mul_mat_vec_q4_0_cuda; break;
+        case GGML_TYPE_Q4_1: launcher = dequantize_mul_mat_vec_q4_1_cuda; break;
+        case GGML_TYPE_Q5_0: launcher = dequantize_mul_mat_vec_q5_0_cuda; break;
+        case GGML_TYPE_Q5_1: launcher = dequantize_mul_mat_vec_q5_1_cuda; break;
+        case GGML_TYPE_Q8_0: launcher = dequantize_mul_mat_vec_q8_0_cuda; break;
+        case GGML_TYPE_F16:  launcher = convert_mul_mat_vec_f16_cuda;     break;
+        default:                                                          break;
+    }
+
+    return launcher;
+}
+
+// buffer pool for cuda
+#define MAX_CUDA_BUFFERS 256
+
+// Scope guard around a std::atomic_flag used as a lock: the constructor busy-waits
+// until it manages to set the flag, the destructor clears it again.
+// Copying is disabled so the flag is cleared exactly once per guard.
+struct scoped_spin_lock {
+    std::atomic_flag& lock;
+    scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
+        // test_and_set returns the previous value: true means someone else holds the flag
+        while (lock.test_and_set(std::memory_order_acquire)) {
+            ; // spin
+        }
+    }
+    ~scoped_spin_lock() {
+        lock.clear(std::memory_order_release);
+    }
+    scoped_spin_lock(const scoped_spin_lock&) = delete;
+    scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
+};
+
+// One slot of the device buffer pool; ptr == nullptr marks the slot as empty.
+struct cuda_buffer {
+    void * ptr = nullptr;
+    size_t size = 0;
+};
+
+// cached device allocations handed out by ggml_cuda_pool_malloc / taken back by ggml_cuda_pool_free
+static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
+// guards every access to g_cuda_buffer_pool
+static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
+
+// Obtain a device buffer of at least `size` bytes.
+// The first pooled buffer that is large enough is reused and removed from the pool;
+// if there is none, a new buffer is allocated with cudaMalloc.
+//   size        - requested size in bytes
+//   actual_size - receives the real size of the returned buffer (pass it to ggml_cuda_pool_free)
+// Returns the device pointer.
+static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (cuda_buffer & slot : g_cuda_buffer_pool) {
+        if (slot.ptr == nullptr || slot.size < size) {
+            continue; // empty slot or buffer too small
+        }
+
+        void * reused = slot.ptr;
+        *actual_size = slot.size;
+
+        // mark the slot as empty
+        slot.ptr  = nullptr;
+        slot.size = 0;
+        return reused;
+    }
+
+    void * fresh;
+    CUDA_CHECK(cudaMalloc((void **) &fresh, size));
+    *actual_size = size;
+    return fresh;
+}
+
+// Return a device buffer to the pool.
+// The buffer is stored in the first empty slot; when every slot is taken a warning
+// is printed and the buffer is released with cudaFree instead.
+//   ptr  - device pointer obtained from ggml_cuda_pool_malloc
+//   size - the actual_size reported for it
+static void ggml_cuda_pool_free(void * ptr, size_t size) {
+    scoped_spin_lock lock(g_cuda_pool_lock);
+
+    for (cuda_buffer & slot : g_cuda_buffer_pool) {
+        if (slot.ptr != nullptr) {
+            continue; // slot in use
+        }
+
+        slot.ptr  = ptr;
+        slot.size = size;
+        return;
+    }
+
+    fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
+    CUDA_CHECK(cudaFree(ptr));
+}
+
+#define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
+#define GGML_CUDA_MAX_EVENTS 64
+// cuBLAS handle; nullptr until ggml_init_cublas() has run
+static cublasHandle_t g_cublasH = nullptr;
+// streams used for the src1 upload, the computation and the result download
+static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_STREAMS] = { nullptr };
+// second set of streams used for the src0 upload and its conversion to fp32
+static cudaStream_t g_cudaStreams2[GGML_CUDA_MAX_STREAMS] = { nullptr };
+// events used to make a stream of the first set wait for work on the second set
+static cudaEvent_t g_cudaEvents[GGML_CUDA_MAX_EVENTS] = { nullptr };
+
+// Create the global streams, events and the cuBLAS handle.
+// Does nothing when the cuBLAS handle already exists.
+void ggml_init_cublas() {
+    if (g_cublasH != nullptr) {
+        return; // already initialized
+    }
+
+    // create streams
+    for (int is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) {
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[is], cudaStreamNonBlocking));
+        CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams2[is], cudaStreamNonBlocking));
+    }
+
+    // create events
+    for (int ie = 0; ie < GGML_CUDA_MAX_EVENTS; ++ie) {
+        CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvents[ie], cudaEventDisableTiming));
+    }
+
+    // create cublas handle
+    CUBLAS_CHECK(cublasCreate(&g_cublasH));
+    CUBLAS_CHECK(cublasSetMathMode(g_cublasH, CUBLAS_TF32_TENSOR_OP_MATH));
+
+    // configure logging to stdout
+    // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
+}
+
+// Allocate `size` bytes of pinned (page-locked) host memory with cudaMallocHost.
+// Returns nullptr when the environment variable GGML_CUDA_NO_PINNED is set or when
+// the allocation fails (a warning is printed in the latter case); the caller is
+// expected to handle nullptr.
+void * ggml_cuda_host_malloc(size_t size) {
+    if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
+        return nullptr;
+    }
+
+    void * ptr = nullptr;
+    cudaError_t err = cudaMallocHost((void **) &ptr, size);
+    if (err != cudaSuccess) {
+        // The failure is recoverable, so reset the runtime's last-error state here;
+        // otherwise a later CUDA_CHECK(cudaGetLastError()) after a kernel launch would
+        // pick up this stale error and abort the process.
+        (void) cudaGetLastError();
+        fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n",
+            size/1024.0/1024.0, cudaGetErrorString(err));
+        return nullptr;
+    }
+
+    return ptr;
+}
+
+// Release host memory with cudaFreeHost; aborts via CUDA_CHECK if that call fails.
+void ggml_cuda_host_free(void * ptr) {
+    CUDA_CHECK(cudaFreeHost(ptr));
+}
+
+// Queue an asynchronous host-to-device copy of one 2D slice of a tensor.
+//   dst    - device destination; the slice is written with densely packed rows
+//   src    - host tensor
+//   i3, i2 - indices of the slice along dimensions 3 and 2
+//   stream - stream the copy is queued on
+// Returns the status of the (last) CUDA copy call; the copy itself may still be in flight.
+static cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
+    const uint64_t ne0 = src->ne[0];
+    const uint64_t ne1 = src->ne[1];
+    const uint64_t nb0 = src->nb[0];
+    const uint64_t nb1 = src->nb[1];
+    const uint64_t nb2 = src->nb[2];
+    const uint64_t nb3 = src->nb[3];
+    const enum ggml_type type = src->type;
+    const size_t ts = ggml_type_size(type);
+    const size_t bs = ggml_blck_size(type);
+
+    // start of the requested slice in host memory
+    const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
+    if (nb0 == ts && nb1 == ts*ne0/bs) {
+        // elements and rows are both densely packed: one flat copy
+        return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
+    } else if (nb0 == ts) {
+        // rows are packed internally but separated by a stride of nb1: one pitched copy
+        return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
+    } else {
+        // elements inside a row are strided as well: one pitched copy per row
+        // NOTE(review): ts/bs is an integer division and becomes 0 when ts < bs - confirm
+        // this branch is only reached for types with a block size of 1
+        for (uint64_t i1 = 0; i1 < ne1; i1++) {
+            const void * rx = (const void *) ((const char *) x + i1*nb1);
+            void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
+            // pretend the row is a matrix with cols=1
+            cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
+            if (r != cudaSuccess) return r;
+        }
+        return cudaSuccess;
+    }
+}
+
+// Matrix multiplication with cuBLAS for fp32 src0 and src1; the result is written to dst->data.
+// Every 2D slice (i03, i02) is uploaded, multiplied with cublasSgemm and downloaded on one
+// of the global streams; the function returns after the whole device has been synchronized.
+static void ggml_cuda_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    // element counts of one src0 / src1 / dst slice and the number of slices
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+
+    // device buffers large enough for all slices
+    size_t x_size, y_size, d_size;
+    float * d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            // slices are distributed over the streams round-robin
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+
+            // this slice's part of the device buffers
+            float * c_X = d_X + i * x_ne;
+            float * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+
+            // copy data to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, ne00,
+                                c_Y, ne10,
+                        &beta,  c_D, ne01));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+
+    // wait for all queued work before the buffers go back to the pool
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
+}
+
+// Matrix multiplication with cuBLAS for fp16 src0 and fp32 src1; the fp32 result is written
+// to dst->data. src1 is converted to fp16 on the host, then every 2D slice (i03, i02) is
+// uploaded, multiplied with cublasGemmEx and downloaded on one of the global streams.
+//   wdata - host scratch buffer that receives the fp16 copy of src1; it is indexed per
+//           slice, so it has to hold one fp16 value per element of src1
+// The function returns after the whole device has been synchronized.
+static void ggml_cuda_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t /* wsize */) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb10 = src1->nb[0];
+    const int nb11 = src1->nb[1];
+    const int nb12 = src1->nb[2];
+    const int nb13 = src1->nb[3];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    // element counts of one src0 / src1 / dst slice and the number of slices
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+
+    // device buffers large enough for all slices
+    size_t x_size, y_size, d_size;
+    half  * d_X =  (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * x_ne, &x_size);
+    half  * d_Y =  (half *) ggml_cuda_pool_malloc(n_mm * sizeof(half) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+
+    // elements of a row are adjacent in memory
+    bool src1_cont_rows = nb10 == sizeof(float);
+    // rows follow each other without padding: the row stride equals the size of one
+    // row, i.e. ne10 elements (not ne11, which is the number of rows)
+    bool src1_cont_cols = (size_t)nb11 == ne10*sizeof(float);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            // slices are distributed over the streams round-robin
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+
+            // this slice's part of the device buffers
+            half  * c_X = d_X + i * x_ne;
+            half  * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+
+            // copy src0 to device
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_X, src0, i03, i02, cudaStream));
+
+            // convert src1 to fp16
+            // TODO: use multiple threads
+            ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
+            char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
+            if (src1_cont_rows) {
+                if (src1_cont_cols) {
+                    // the whole slice is one packed run of floats
+                    ggml_fp32_to_fp16_row((float *) src1i, tmp, ne10*ne11);
+                }
+                else {
+                    // packed rows with a stride between them: convert row by row
+                    for (int64_t i01 = 0; i01 < ne11; i01++) {
+                        ggml_fp32_to_fp16_row((float *) (src1i + i01*nb11), tmp + i01*ne10, ne10);
+                    }
+                }
+            }
+            else {
+                for (int64_t i01 = 0; i01 < ne11; i01++) {
+                    for (int64_t i00 = 0; i00 < ne10; i00++) {
+                        // very slow due to no inlining
+                        tmp[i01*ne10 + i00] = ggml_fp32_to_fp16(*(float *) (src1i + i01*nb11 + i00*nb10));
+                    }
+                }
+            }
+
+            // copy src1 to device
+            CUDA_CHECK(cudaMemcpyAsync(c_Y, tmp, sizeof(half) * y_ne, cudaMemcpyHostToDevice, cudaStream));
+
+            // compute
+            CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+            CUBLAS_CHECK(
+                cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                        ne01, ne11, ne10,
+                        &alpha, c_X, CUDA_R_16F, ne00,
+                                c_Y, CUDA_R_16F, ne10,
+                        &beta,  c_D, CUDA_R_32F, ne01,
+                        CUBLAS_COMPUTE_32F_FAST_16F,
+                        CUBLAS_GEMM_DEFAULT));
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+
+    // wait for all queued work before the buffers go back to the pool
+    CUDA_CHECK(cudaDeviceSynchronize());
+    ggml_cuda_pool_free(d_X, x_size);
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
+}
+
+// Matrix multiplication for a src0 that has to be converted on the device (quantized or
+// fp16) and an fp32 src1; the fp32 result is written to dst->data.
+// Two strategies are used per 2D slice (i03, i02):
+//   - src1 has a single row (ne11 == 1): fused dequantize + matrix-vector kernel
+//   - otherwise: src0 is converted to fp32 on the device and multiplied with cublasSgemm
+// src0 is uploaded from the host unless its backend is GGML_BACKEND_CUDA, in which case
+// src0->data is used as a device pointer. The function returns after the whole device has
+// been synchronized.
+static void ggml_cuda_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+    const int64_t ne00 = src0->ne[0];
+    const int64_t ne01 = src0->ne[1];
+    const int64_t ne02 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
+
+    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
+
+    const int nb2  = dst->nb[2];
+    const int nb3  = dst->nb[3];
+    const ggml_type type = src0->type;
+    const bool mul_mat_vec = ne11 == 1;
+
+    const float alpha = 1.0f;
+    const float beta = 0.0f;
+    // element counts of one src0 / src1 / dst slice and the number of slices
+    const int x_ne = ne01 * ne00;
+    const int y_ne = ne11 * ne10;
+    const int d_ne = ne11 * ne01;
+    const int n_mm = ne03 * ne02;
+    // size in bytes of one src0 slice in its stored form
+    const size_t q_sz = ggml_type_size(type) * x_ne / ggml_blck_size(type);
+
+    size_t x_size, y_size, d_size, q_size;
+    // the fp32 copy of src0 is only needed for the cuBLAS path
+    float * d_X = nullptr;
+    if (!mul_mat_vec) {
+        d_X = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * x_ne, &x_size);
+    }
+    float * d_Y = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * y_ne, &y_size);
+    float * d_D = (float *) ggml_cuda_pool_malloc(n_mm * sizeof(float) * d_ne, &d_size);
+    char  * d_Q = (char  *) ggml_cuda_pool_malloc(n_mm * q_sz, &q_size);
+
+    const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(type);
+    dequantize_mul_mat_vec_cuda_t dmmv = ggml_get_dequantize_mul_mat_vec_cuda(type);
+    GGML_ASSERT(to_fp32_cuda != nullptr);
+    // dmmv is called through this pointer on the matrix-vector path, so it must exist there
+    GGML_ASSERT(!mul_mat_vec || dmmv != nullptr);
+
+    for (int64_t i03 = 0; i03 < ne03; i03++) {
+        for (int64_t i02 = 0; i02 < ne02; i02++) {
+            int i = i03*ne02 + i02;
+            // cudaStream: src1 upload, compute, download; cudaStream2: src0 upload / conversion;
+            // cudaEvent lets cudaStream wait for the work queued on cudaStream2
+            cudaStream_t cudaStream = g_cudaStreams[i % GGML_CUDA_MAX_STREAMS];
+            cudaStream_t cudaStream2 = g_cudaStreams2[i % GGML_CUDA_MAX_STREAMS];
+            cudaEvent_t  cudaEvent = g_cudaEvents[i % GGML_CUDA_MAX_EVENTS];
+
+            // this slice's part of the device buffers
+            float * c_Y = d_Y + i * y_ne;
+            float * c_D = d_D + i * d_ne;
+            char  * c_Q = d_Q + i * q_sz;
+
+            // copy src0 to device if necessary
+            if (src0->backend == GGML_BACKEND_CPU) {
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Q, src0, i03, i02, cudaStream2));
+            } else if (src0->backend == GGML_BACKEND_CUDA) {
+                c_Q = ((char *) src0->data) + i * q_sz;
+            } else {
+                GGML_ASSERT(false);
+            }
+            if (mul_mat_vec) { // specialized dequantize_mul_mat_vec kernel
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+                // wait for data
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+                // compute
+                dmmv(c_Q, c_Y, c_D, ne00, ne01, cudaStream);
+                CUDA_CHECK(cudaGetLastError());
+
+            } else { // general dequantization kernel + cuBLAS matrix matrix multiplication
+                float * c_X = d_X + i * x_ne;
+
+                // convert src0 to fp32 on device
+                to_fp32_cuda(c_Q, c_X, x_ne, cudaStream2);
+                CUDA_CHECK(cudaGetLastError());
+                CUDA_CHECK(cudaEventRecord(cudaEvent, cudaStream2));
+
+                // copy src1 to device
+                CUDA_CHECK(ggml_cuda_h2d_tensor_2d(c_Y, src1, i03, i02, cudaStream));
+
+                // wait for conversion
+                CUDA_CHECK(cudaStreamWaitEvent(cudaStream, cudaEvent, 0));
+
+                // compute
+                CUBLAS_CHECK(cublasSetStream(g_cublasH, cudaStream));
+                CUBLAS_CHECK(
+                    cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
+                            ne01, ne11, ne10,
+                            &alpha, c_X, ne00,
+                                    c_Y, ne10,
+                            &beta,  c_D, ne01));
+            }
+
+            // copy dst to host
+            float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
+            CUDA_CHECK(cudaMemcpyAsync(d, c_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, cudaStream));
+        }
+    }
+
+    // wait for all queued work before the buffers go back to the pool
+    CUDA_CHECK(cudaDeviceSynchronize());
+    if (!mul_mat_vec) {
+        ggml_cuda_pool_free(d_X, x_size);
+    }
+    ggml_cuda_pool_free(d_Y, y_size);
+    ggml_cuda_pool_free(d_D, d_size);
+    ggml_cuda_pool_free(d_Q, q_size);
+}
+
+// Decide whether a matrix multiplication can be handled by the CUDA path.
+// Requires src0 to be fp32, fp16 or quantized, and src1 and dst to be fp32. On top of
+// that either src0 already lives on the CUDA backend, or the dimensions are large enough.
+bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    const bool src0_type_ok =
+        src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type);
+
+    if (!src0_type_ok || src1->type != GGML_TYPE_F32 || dst->type != GGML_TYPE_F32) {
+        return false;
+    }
+
+    if (src0->backend == GGML_BACKEND_CUDA) {
+        return true;
+    }
+
+    // TODO: find the optimal values for these
+    return dst->ne[0] >= 32 && dst->ne[1] >= 32 && src1->ne[0] >= 32;
+}
+
+// Choose between the fp16 path and the convert-on-device path by comparing how many
+// bytes each of them has to transfer to the device; returns true for the fp16 path.
+bool ggml_cuda_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * /* dst */) {
+    const size_t bytes_src0 = ggml_nbytes(src0);
+    const size_t bytes_src1 = ggml_nbytes(src1);
+
+    // mul_mat_f16: src1 is converted to fp16 on cpu
+    const size_t bytes_f16_path = bytes_src0 + sizeof(half) * ggml_nelements(src1);
+
+    // mul_mat_q: src0 is converted to fp32 on device
+    const size_t bytes_q_path = bytes_src0 + bytes_src1;
+
+    // choose the smaller one to transfer to the device
+    // TODO: this is not always the best choice due to the overhead of converting to fp16
+    return bytes_f16_path < bytes_q_path;
+}
+
+// Entry point of the CUDA matrix multiplication: dispatch on the type of src0.
+//   fp32                 -> ggml_cuda_mul_mat_f32
+//   fp16                 -> ggml_cuda_mul_mat_f16 if ggml_cuda_mul_mat_use_f16 says so,
+//                           otherwise ggml_cuda_mul_mat_q_f32
+//   quantized            -> ggml_cuda_mul_mat_q_f32
+// wdata / wsize are only forwarded to the fp16 path.
+void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) {
+    GGML_ASSERT(ggml_cuda_can_mul_mat(src0, src1, dst));
+
+    const ggml_type type0 = src0->type;
+
+    if (type0 == GGML_TYPE_F32) {
+        ggml_cuda_mul_mat_f32(src0, src1, dst);
+        return;
+    }
+
+    if (type0 == GGML_TYPE_F16 && ggml_cuda_mul_mat_use_f16(src0, src1, dst)) {
+        ggml_cuda_mul_mat_f16(src0, src1, dst, wdata, wsize);
+        return;
+    }
+
+    if (type0 == GGML_TYPE_F16 || ggml_is_quantized(type0)) {
+        ggml_cuda_mul_mat_q_f32(src0, src1, dst);
+        return;
+    }
+
+    GGML_ASSERT(false);
+}
+
+// Size in bytes of the host scratch buffer for ggml_cuda_mul_mat: one fp16 value per
+// element of src1 when ggml_cuda_mul_mat_use_f16 selects the fp16 path, 0 otherwise.
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
+    return ggml_cuda_mul_mat_use_f16(src0, src1, dst)
+        ? ggml_nelements(src1) * sizeof(ggml_fp16_t)
+        : 0;
+}
+
+// Move a tensor's data to the device: allocate a device buffer for the whole tensor,
+// upload every 2D slice into it, then point tensor->data at the device buffer and mark
+// the tensor as GGML_BACKEND_CUDA. Slices are stored back to back in (i3, i2) order,
+// which is the layout ggml_cuda_mul_mat_q_f32 indexes with for CUDA-backend tensors.
+// The previous tensor->data pointer is overwritten, not freed.
+void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
+    const int64_t ne0 = tensor->ne[0];
+    const int64_t ne1 = tensor->ne[1];
+    const int64_t ne2 = tensor->ne[2];
+    const int64_t ne3 = tensor->ne[3];
+
+    const ggml_type type = tensor->type;
+    // size in bytes of the whole tensor and of one ne0 x ne1 slice in stored form
+    const size_t q_sz = ggml_type_size(type) * ne0 * ne1 * ne2 * ne3 / ggml_blck_size(type);
+    const size_t slice_sz = ggml_type_size(type) * ne0 * ne1 / ggml_blck_size(type);
+
+    // q_size (the real size of the pooled buffer) is not needed afterwards:
+    // the buffer is handed over to the tensor
+    size_t q_size;
+    char * d_Q = (char *) ggml_cuda_pool_malloc(q_sz, &q_size);
+
+    cudaStream_t cudaStream2 = g_cudaStreams2[0];
+
+    // copy tensor to device, one 2D slice at a time
+    // (uploading only slice (0, 0) would leave the rest of the buffer uninitialized)
+    for (int64_t i3 = 0; i3 < ne3; i3++) {
+        for (int64_t i2 = 0; i2 < ne2; i2++) {
+            const int64_t i = i3*ne2 + i2;
+            CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Q + i*slice_sz, tensor, i3, i2, cudaStream2));
+        }
+    }
+    CUDA_CHECK(cudaDeviceSynchronize());
+
+    tensor->data = d_Q;
+    tensor->backend = GGML_BACKEND_CUDA;
+}
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@ -0,0 +1,21 @@
+#pragma once
+
+#include "ggml.h"
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+void   ggml_init_cublas(void);
+
+// ggml_cuda_mul_mat() asserts that ggml_cuda_can_mul_mat() holds for its operands,
+// so callers are expected to check it first.
+bool   ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+
+// size in bytes of the work buffer to pass to ggml_cuda_mul_mat() as wdata/wsize (may be 0)
+size_t ggml_cuda_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+
+// dst = mul_mat(src0, src1) on the device; src0 may be F32, F16 or a quantized type
+void   ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+
+// TODO: export these with GGML_API
+void * ggml_cuda_host_malloc(size_t size);
+void   ggml_cuda_host_free(void * ptr);
+
+// copies the tensor data to the device, then sets tensor->data to the device
+// buffer and tensor->backend to GGML_BACKEND_CUDA
+void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@ -0,0 +1,361 @@
+#include "ggml-opencl.h"
+
+#define CL_TARGET_OPENCL_VERSION 110
+#include <clblast_c.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "ggml.h"
+
+// Stringify the macro arguments. Used to embed the OpenCL C program below as a
+// C string. Comments are removed by the C preprocessor before stringification,
+// so the comments inside the quoted program never reach the OpenCL compiler.
+#define MULTILINE_QUOTE(...) #__VA_ARGS__
+
+// OpenCL C source of the dequantization kernels. Each kernel expands blocks of
+// one quantized ggml type (x) into floats (y). ggml_cl_sgemm_wrapper() launches
+// them with a global size of n*k work-items and a work-group size of 16
+// (q4/q5 types, two output values per work-item) or 32 (q8_0, one value).
+const char * clblast_dequant = MULTILINE_QUOTE(
+
+typedef char int8_t;
+typedef uchar uint8_t;
+typedef int int32_t;
+typedef uint uint32_t;
+
+// The structs below must match the byte layout of the quantized blocks that the
+// host uploads unchanged into the kernel input buffer.
+
+constant uint QK4_0 = 32;
+struct block_q4_0
+{
+    float d;
+    uint8_t qs[QK4_0 / 2];
+};
+
+constant uint QK4_1 = 32;
+struct block_q4_1
+{
+    float d;
+    float m;
+    uint8_t qs[QK4_1 / 2];
+};
+
+// packed: qh follows the 2-byte d directly, without alignment padding
+constant uint QK5_0 = 32;
+struct __attribute__ ((packed)) block_q5_0
+{
+    half d;
+    uint32_t qh;
+    uint8_t qs[QK5_0 / 2];
+};
+
+constant uint QK5_1 = 32;
+struct block_q5_1
+{
+    half d;
+    half m;
+    uint32_t qh;
+    uint8_t qs[QK5_1 / 2];
+};
+
+// Q8_0 quants are signed 8-bit values (int8_t in the ggml.c block_q8_0), so they
+// must be read as signed here; reading them as uchar turns every negative
+// quant into a value in 128..255.
+constant uint QK8_0 = 32;
+struct block_q8_0
+{
+    float d;
+    int8_t qs[QK8_0];
+};
+
+
+// y = (nibble - 8) * d; low nibbles fill the first half of the block, high nibbles the second
+__kernel void dequantize_row_q4_0(__global struct block_q4_0* x, __global float* y) {
+    constant uint qk = QK4_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+
+    const int x0 = (x[i].qs[j] & 0xf) - 8;
+    const int x1 = (x[i].qs[j] >>  4) - 8;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+// y = nibble * d + m
+__kernel void dequantize_row_q4_1(__global struct block_q4_1* x, __global float* y) {
+    constant uint qk = QK4_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    const float m = x[i].m;
+
+    const int x0 = (x[i].qs[j] & 0xf);
+    const int x1 = (x[i].qs[j] >>  4);
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+// y = ((nibble | fifth bit) - 16) * d; the fifth bits come from qh:
+// bit j for the low-nibble value, bit j + 16 for the high-nibble value
+__kernel void dequantize_row_q5_0(__global struct block_q5_0* x, __global float* y) {
+    constant uint qk = QK5_0;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+    const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
+    const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
+
+    y[i*qk + j + 0   ] = x0*d;
+    y[i*qk + j + qk/2] = x1*d;
+}
+
+// y = (nibble | fifth bit) * d + m; fifth bits are taken from qh as in q5_0
+__kernel void dequantize_row_q5_1(__global struct block_q5_1* x, __global float* y) {
+    constant uint qk = QK5_1;
+
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = vload_half(0, (__global half*) &x[i].d);
+    const float m = vload_half(0, (__global half*) &x[i].m);
+
+    uint32_t qh = x[i].qh;
+
+    const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+    const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
+
+    const int x0 = (x[i].qs[j] & 0xf) | xh_0;
+    const int x1 = (x[i].qs[j] >>  4) | xh_1;
+
+    y[i*qk + j + 0   ] = x0*d + m;
+    y[i*qk + j + qk/2] = x1*d + m;
+}
+
+// y = quant * d, one signed quant per work-item
+__kernel void dequantize_row_q8_0(__global struct block_q8_0* x, __global float* y) {
+    constant uint qk = QK8_0;
+    const uint i = get_global_id(0) / qk;
+    const uint j = get_local_id(0);
+
+    const float d = x[i].d;
+    y[i*qk + j] = x[i].qs[j]*d;
+}
+
+);
+
+// Evaluate the OpenCL status expression `err` once; if it is not CL_SUCCESS,
+// print `name` (a string describing the failed call), the status code and the
+// source location to stderr, then terminate the process with exit(1).
+#define CL_CHECK(err, name)                                                                     \
+    do {                                                                                        \
+        cl_int err_ = (err);                                                                    \
+        if (err_ != CL_SUCCESS) {                                                               \
+            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__);   \
+            exit(1);                                                                            \
+        }                                                                                       \
+    } while (0)
+
+// Process-wide OpenCL state, set up by ggml_cl_init().
+static cl_platform_id platform;
+static cl_device_id device;
+static cl_context context;
+static cl_command_queue queue;
+static cl_program program;
+// one dequantization kernel per supported quantized type
+static cl_kernel kernel_q4_0, kernel_q4_1, kernel_q5_0, kernel_q5_1, kernel_q8_0;
+// device buffers reused across ggml_cl_sgemm_wrapper() calls:
+//   a  - matrix A, qb - quantized B as uploaded, b - float B, c - result
+static cl_mem cl_buffer_a, cl_buffer_qb, cl_buffer_b, cl_buffer_c;
+// current size in bytes of each buffer above; 0 means not allocated yet (see ggml_cl_malloc)
+static size_t cl_size_a = 0, cl_size_qb = 0, cl_size_b = 0, cl_size_c = 0;
+
+// Create and build an OpenCL program from a NUL-terminated source string.
+//
+//   ctx            - context the program is created in
+//   dev            - device whose build log is reported when the build fails
+//   program_buffer - OpenCL C source text
+//
+// Returns the built program. On failure a diagnostic (including the build log
+// when it can be retrieved) is written to stderr and the process exits with
+// status 1.
+static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, const char* program_buffer) {
+    cl_int err = CL_SUCCESS;
+    const size_t program_size = strlen(program_buffer);
+
+    cl_program p = clCreateProgramWithSource(ctx, 1, (const char**)&program_buffer, &program_size, &err);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "OpenCL error creating program\n");
+        exit(1);
+    }
+
+    // no explicit device list: build for all devices associated with the program
+    err = clBuildProgram(p, 0, NULL, NULL, NULL, NULL);
+    if (err != CL_SUCCESS) {
+        fprintf(stderr, "OpenCL error %d building program\n", err);
+
+        // first query the log length, then fetch the log itself
+        size_t log_size = 0;
+        clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
+
+        char * program_log = (char*) malloc(log_size + 1);
+        if (program_log != NULL) {
+            // pre-terminate so the buffer is a valid string even if the query fails
+            memset(program_log, 0, log_size + 1);
+            clGetProgramBuildInfo(p, dev, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL);
+            fprintf(stderr, "%s\n", program_log);
+            free(program_log);
+        }
+        exit(1);
+    }
+
+    return p;
+}
+
+// Initialize the global OpenCL state used by ggml_cl_sgemm_wrapper().
+//
+// The platform and device are selected by index through the environment
+// variables GGML_CLBLAST_PLATFORM and GGML_CLBLAST_DEVICE (both default to 0).
+// Creates the context and an out-of-order command queue, builds the
+// dequantization program and creates one kernel per quantized type.
+//
+// Any failure - including a platform/device index outside the range reported by
+// the OpenCL runtime - prints a diagnostic to stderr and exits with status 1.
+void ggml_cl_init(void) {
+    cl_int err = 0;
+    const char * platform_env = getenv("GGML_CLBLAST_PLATFORM");
+    const char * device_env   = getenv("GGML_CLBLAST_DEVICE");
+    const int plat_num = (platform_env == NULL ? 0 : atoi(platform_env));
+    const int dev_num  = (device_env   == NULL ? 0 : atoi(device_env));
+    printf("\nInitializing CLBlast (First Run)...");
+    printf("\nAttempting to use: Platform=%d, Device=%d (If invalid, program will crash)\n",plat_num,dev_num);
+
+    // select the platform, validating the requested index before using it
+    cl_uint num_platforms = 0;
+    err = clGetPlatformIDs(0, NULL, &num_platforms);
+    CL_CHECK(err, "clGetPlatformIDs");
+    if (plat_num < 0 || (cl_uint) plat_num >= num_platforms) {
+        fprintf(stderr, "Error: OpenCL platform index %d is out of range (%u platform(s) available)\n", plat_num, (unsigned) num_platforms);
+        exit(1);
+    }
+    cl_platform_id* platforms = (cl_platform_id*)malloc(num_platforms*sizeof(cl_platform_id));
+    if (platforms == NULL) {
+        fprintf(stderr, "Error: out of memory while enumerating OpenCL platforms\n");
+        exit(1);
+    }
+    err = clGetPlatformIDs(num_platforms, platforms, NULL);
+    CL_CHECK(err, "clGetPlatformIDs");
+    platform = platforms[plat_num];
+    // the name is informational only; keep the buffer a valid string if the query fails
+    char platform_buffer[1024] = "";
+    clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_buffer), platform_buffer, NULL);
+
+    // select the device on that platform, again validating the index
+    cl_uint num_devices = 0;
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 0, NULL, &num_devices);
+    CL_CHECK(err, "clGetDeviceIDs");
+    if (dev_num < 0 || (cl_uint) dev_num >= num_devices) {
+        fprintf(stderr, "Error: OpenCL device index %d is out of range (%u device(s) available)\n", dev_num, (unsigned) num_devices);
+        exit(1);
+    }
+    cl_device_id* devices = (cl_device_id*)malloc(num_devices*sizeof(cl_device_id));
+    if (devices == NULL) {
+        fprintf(stderr, "Error: out of memory while enumerating OpenCL devices\n");
+        exit(1);
+    }
+    err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, num_devices, devices, NULL);
+    CL_CHECK(err, "clGetDeviceIDs");
+    device = devices[dev_num];
+    char device_buffer[1024] = "";
+    clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_buffer), device_buffer, NULL);
+    printf("Using Platform: %s Device: %s\n", platform_buffer, device_buffer);
+
+    context = clCreateContext(NULL, 1, &device, NULL, NULL, &err);
+    CL_CHECK(err, "clCreateContext");
+    queue = clCreateCommandQueue(context, device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &err);
+    CL_CHECK(err, "clCreateCommandQueue");
+
+    free(platforms);
+    free(devices);
+
+    program = build_program_from_source(context, device, clblast_dequant);
+
+    // Prepare dequantize kernels
+    kernel_q4_0 = clCreateKernel(program, "dequantize_row_q4_0", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q4_1 = clCreateKernel(program, "dequantize_row_q4_1", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q5_0 = clCreateKernel(program, "dequantize_row_q5_0", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q5_1 = clCreateKernel(program, "dequantize_row_q5_1", &err);
+    CL_CHECK(err, "clCreateKernel");
+    kernel_q8_0 = clCreateKernel(program, "dequantize_row_q8_0", &err);
+    CL_CHECK(err, "clCreateKernel");
+}
+
+// Ensure the device buffer *buf can hold at least req_size bytes.
+//
+//   req_size - required size in bytes
+//   cur_size - in/out: size currently recorded for *buf (0 = nothing allocated)
+//   flags    - cl_mem_flags used when a new buffer has to be created
+//   buf      - in/out: the buffer handle
+//
+// A buffer that is already large enough is left untouched. Otherwise the old
+// buffer (if any) is released and a new one of exactly req_size bytes is
+// created; the previous contents are not preserved. Exits via CL_CHECK when
+// the allocation fails.
+static void ggml_cl_malloc(size_t req_size, size_t* cur_size, cl_mem_flags flags, cl_mem* buf) {
+    const size_t capacity = *cur_size;
+
+    if (capacity >= req_size) {
+        return;
+    }
+
+    // a non-zero recorded size means *buf refers to a live buffer
+    if (capacity != 0) {
+        clReleaseMemObject(*buf);
+    }
+
+    cl_int status;
+    cl_mem fresh = clCreateBuffer(context, flags, req_size, NULL, &status);
+
+    *buf      = fresh;
+    *cur_size = req_size;
+    CL_CHECK(status, "clCreateBuffer");
+}
+
+// Run a single-precision GEMM through CLBlast, optionally dequantizing B on the
+// device first.
+//
+//   order, trans_a, trans_b - layout / transpose options, cast directly to the
+//                             corresponding CLBlast enums
+//   m, n, k                 - GEMM dimensions; A is uploaded as m*k floats, the
+//                             result is read back as m*n floats
+//   alpha, beta             - GEMM scalars, forwarded to CLBlastSgemm
+//   host_a, lda             - host matrix A (float data) and its leading dimension
+//   host_b, ldb             - host matrix B: n*k floats when btype is GGML_TYPE_F32,
+//                             otherwise the quantized blocks of type btype that
+//                             encode n*k values
+//   host_c, ldc             - host destination for the result
+//   btype                   - ggml type of host_b (F32, Q4_0, Q4_1, Q5_0, Q5_1, Q8_0)
+//
+// Blocks until the result is in host_c. Any OpenCL failure exits the process
+// through CL_CHECK; an unsupported btype or a CLBlast failure aborts.
+//
+// NOTE(review): host_c is never uploaded, so with beta != 0 the GEMM would read
+// whatever cl_buffer_c currently holds - presumably callers always pass beta = 0.
+void ggml_cl_sgemm_wrapper(
+        const enum ggml_blas_order order, const enum ggml_blas_op trans_a, const enum ggml_blas_op trans_b,
+        const int m, const int n, const int k,
+        const float alpha, const void *host_a, const int lda,
+        const float *host_b, const int ldb, const float beta,
+        float *host_c, const int ldc, const int btype) {
+    cl_int err = 0;
+
+    // widen before multiplying so that large matrices cannot overflow int
+    const size_t global = (size_t) n * (size_t) k;
+
+    // only meaningful when dequant is true; initialized to keep the F32 path well-defined
+    cl_kernel kernel = NULL;
+    size_t local = 0, size_qb = 0;
+    bool dequant;
+
+    // pick the dequantization kernel, its work-group size and the size in bytes
+    // of the quantized input (bytes per 32-value block * number of blocks)
+    switch (btype) {
+    case GGML_TYPE_F32:
+        dequant = false;
+        break;
+    case GGML_TYPE_Q4_0:
+        dequant = true;
+        kernel = kernel_q4_0;
+        local = 16;
+        size_qb = global * (sizeof(float) + local) / 32;
+        break;
+    case GGML_TYPE_Q4_1:
+        dequant = true;
+        kernel = kernel_q4_1;
+        local = 16;
+        size_qb = global * (sizeof(float) * 2 + local) / 32;
+        break;
+    case GGML_TYPE_Q5_0:
+        dequant = true;
+        kernel = kernel_q5_0;
+        local = 16;
+        size_qb = global * (sizeof(ggml_fp16_t) + sizeof(uint32_t) + local) / 32;
+        break;
+    case GGML_TYPE_Q5_1:
+        dequant = true;
+        kernel = kernel_q5_1;
+        local = 16;
+        size_qb = global * (sizeof(ggml_fp16_t) * 2 + sizeof(uint32_t) + local) / 32;
+        break;
+    case GGML_TYPE_Q8_0:
+        dequant = true;
+        kernel = kernel_q8_0;
+        local = 32;
+        size_qb = global * (sizeof(float) + local) / 32;
+        break;
+    default:
+        fprintf(stderr, "Error: Unsupported OpenCL btype %d\n", btype);
+        abort();
+    }
+
+    const size_t size_a =  (size_t) m * (size_t) k * sizeof(float);
+    const size_t size_b =  (size_t) n * (size_t) k * sizeof(float);
+    const size_t size_c =  (size_t) m * (size_t) n * sizeof(float);
+
+    // Prepare buffers
+    ggml_cl_malloc(size_a, &cl_size_a, CL_MEM_READ_ONLY, &cl_buffer_a);
+    if (dequant) {
+        ggml_cl_malloc(size_qb, &cl_size_qb, CL_MEM_READ_ONLY, &cl_buffer_qb);
+    }
+    ggml_cl_malloc(size_b, &cl_size_b, CL_MEM_READ_WRITE, &cl_buffer_b);
+    ggml_cl_malloc(size_c, &cl_size_c, CL_MEM_WRITE_ONLY, &cl_buffer_c);
+
+    cl_event ev_a, ev_qb, ev_b;
+
+    // upload B: either the quantized blocks (dequantized on the device below)
+    // or the float data directly
+    if (dequant) {
+        err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &cl_buffer_qb);
+        err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &cl_buffer_b);
+        CL_CHECK(err, "clSetKernelArg");
+        err = clEnqueueWriteBuffer(queue, cl_buffer_qb, CL_FALSE, 0, size_qb, host_b, 0, NULL, &ev_qb);
+        CL_CHECK(err, "clEnqueueWriteBuffer qb");
+    } else {
+        err = clEnqueueWriteBuffer(queue, cl_buffer_b, CL_FALSE, 0, size_b, host_b, 0, NULL, &ev_b);
+        CL_CHECK(err, "clEnqueueWriteBuffer b");
+    }
+
+    err = clEnqueueWriteBuffer(queue, cl_buffer_a, CL_FALSE, 0, size_a, host_a, 0, NULL, &ev_a);
+    CL_CHECK(err, "clEnqueueWriteBuffer a");
+    if (dequant) {
+        // the kernel waits for the quantized upload; its completion event stands in for ev_b
+        err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 1, &ev_qb, &ev_b);
+        CL_CHECK(err, "clEnqueueNDRangeKernel");
+        clReleaseEvent(ev_qb);
+    }
+
+    // both inputs must be on the device before the GEMM is enqueued
+    err = clWaitForEvents(1, &ev_a);
+    CL_CHECK(err, "clWaitForEvents a");
+    err = clWaitForEvents(1, &ev_b);
+    CL_CHECK(err, "clWaitForEvents b");
+    clReleaseEvent(ev_a);
+    clReleaseEvent(ev_b);
+
+    cl_event ev_sgemm;
+    CLBlastStatusCode status = CLBlastSgemm((CLBlastLayout)order,
+                                            (CLBlastTranspose)trans_a, (CLBlastTranspose)trans_b,
+                                            m, n, k,
+                                            alpha,
+                                            cl_buffer_a, 0, lda,
+                                            cl_buffer_b, 0, ldb,
+                                            beta,
+                                            cl_buffer_c, 0, ldc,
+                                            &queue, &ev_sgemm);
+
+    if (status != CLBlastSuccess) {
+        fprintf(stderr, "Error: CLBlast SGEMM %d\n", status);
+        abort();
+    }
+
+    // blocking read of the result, ordered after the GEMM; checking the status
+    // guarantees ev_c is a valid event before it is waited on and released
+    cl_event ev_c;
+    err = clEnqueueReadBuffer(queue, cl_buffer_c, CL_TRUE, 0, size_c, host_c, 1, &ev_sgemm, &ev_c);
+    CL_CHECK(err, "clEnqueueReadBuffer c");
+
+    // Wait for completion
+    err = clWaitForEvents(1, &ev_c);
+    CL_CHECK(err, "clWaitForEvents c");
+    clReleaseEvent(ev_sgemm);
+    clReleaseEvent(ev_c);
+}
--- a/ggml-opencl.h
+++ b/ggml-opencl.h
@ -0,0 +1,24 @@
+#pragma once
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
+// Sets up the global OpenCL state (context, queue, dequantization kernels)
+// used by ggml_cl_sgemm_wrapper().
+void ggml_cl_init(void);
+
+// Matrix storage order; ggml-opencl.c casts these values directly to CLBlastLayout.
+enum ggml_blas_order {
+    GGML_BLAS_ORDER_ROW_MAJOR    = 101,
+    GGML_BLAS_ORDER_COLUMN_MAJOR = 102,
+};
+
+// Per-operand transpose option; ggml-opencl.c casts these values directly to CLBlastTranspose.
+enum ggml_blas_op {
+    GGML_BLAS_OP_N = 111,
+    GGML_BLAS_OP_T = 112,
+    GGML_BLAS_OP_C = 113,
+};
+
+// SGEMM on the OpenCL device. btype is the ggml type of host_b; quantized
+// types are dequantized on the device before the multiplication.
+void ggml_cl_sgemm_wrapper(
+        const enum ggml_blas_order order,
+        const enum ggml_blas_op    trans_a,
+        const enum ggml_blas_op    trans_b,
+        const int m, const int n, const int k,
+        const float   alpha,
+        const void  * host_a, const int lda,
+        const float * host_b, const int ldb,
+        const float   beta,
+        float       * host_c, const int ldc,
+        const int     btype);
+
+#ifdef  __cplusplus
+}
+#endif
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -169,21 +169,49 @@
 //
 //

-#ifdef  __cplusplus
-extern "C" {
+#ifdef GGML_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef GGML_BUILD
+#            define GGML_API __declspec(dllexport)
+#        else
+#            define GGML_API __declspec(dllimport)
+#        endif
+#    else
+#        define GGML_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define GGML_API
 #endif

 #include <stdint.h>
 #include <stddef.h>
 #include <stdbool.h>

+#define GGML_FILE_MAGIC   0x67676d6c // "ggml"
+#define GGML_FILE_VERSION 1
+
+#define GGML_QNT_VERSION        1    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS          4
 #define GGML_MAX_NODES         4096
-#define GGML_MAX_PARAMS        16
+#define GGML_MAX_PARAMS        256
 #define GGML_MAX_CONTEXTS      64
 #define GGML_MAX_OPT           4
 #define GGML_DEFAULT_N_THREADS 4

+// Abort with a diagnostic on stderr (file, line and the text of the failed
+// expression) when x evaluates to false.
+// NOTE(review): the expansion calls fprintf() and abort(), but only <stdint.h>,
+// <stddef.h> and <stdbool.h> are included in the visible part of this header -
+// confirm that users of the macro get <stdio.h> and <stdlib.h> from elsewhere.
+#define GGML_ASSERT(x) \
+    do { \
+        if (!(x)) { \
+            fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+            abort(); \
+        } \
+    } while (0)
+
+#ifdef  __cplusplus
+extern "C" {
+#endif
+
 #ifdef __ARM_NEON
    // we use the built-in 16-bit float type
    typedef __fp16 ggml_fp16_t;
@ -192,36 +220,66 @@ typedef uint16_t ggml_fp16_t;
 #endif

    // convert FP16 <-> FP32
-float       ggml_fp16_to_fp32(ggml_fp16_t x);
-ggml_fp16_t ggml_fp32_to_fp16(float x);
+    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
+
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n);
+    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n);

    struct ggml_object;
    struct ggml_context;

    enum ggml_type {
-    // explicitly numbered values are used in llama.cpp files
        GGML_TYPE_F32  = 0,
        GGML_TYPE_F16  = 1,
        GGML_TYPE_Q4_0 = 2,
        GGML_TYPE_Q4_1 = 3,
+        // GGML_TYPE_Q4_2 = 4, support has been removed
+        // GGML_TYPE_Q4_3 (5) support has been removed
+        GGML_TYPE_Q5_0 = 6,
+        GGML_TYPE_Q5_1 = 7,
+        GGML_TYPE_Q8_0 = 8,
+        GGML_TYPE_Q8_1 = 9,
        GGML_TYPE_I8,
        GGML_TYPE_I16,
        GGML_TYPE_I32,
        GGML_TYPE_COUNT,
    };

+    // where a tensor's data lives; stored in ggml_tensor::backend
+    // (ggml_cuda_transform_tensor() switches a tensor to GGML_BACKEND_CUDA
+    // after copying its data to the device)
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
+    // model file types
+    // values are explicit and not contiguous: 5 and 6 are unassigned here
+    // (presumably the file types of the removed Q4_2 / Q4_3 formats - TODO confirm)
+    enum ggml_ftype {
+        GGML_FTYPE_UNKNOWN     = -1,
+        GGML_FTYPE_ALL_F32     = 0,
+        GGML_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        GGML_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        GGML_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+    };
+
    // available tensor operations:
    enum ggml_op {
        GGML_OP_NONE = 0,

        GGML_OP_DUP,
        GGML_OP_ADD,
+        GGML_OP_ADD1,
+        GGML_OP_ACC,
        GGML_OP_SUB,
        GGML_OP_MUL,
        GGML_OP_DIV,
        GGML_OP_SQR,
        GGML_OP_SQRT,
+        GGML_OP_LOG,
        GGML_OP_SUM,
+        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
        GGML_OP_REPEAT,
        GGML_OP_ABS,
@ -231,12 +289,15 @@ enum ggml_op {
        GGML_OP_RELU,
        GGML_OP_GELU,
        GGML_OP_SILU,
+        GGML_OP_SILU_BACK,
        GGML_OP_NORM, // normalize
        GGML_OP_RMS_NORM,
+        GGML_OP_RMS_NORM_BACK,

        GGML_OP_MUL_MAT,

        GGML_OP_SCALE,
+        GGML_OP_SET,
        GGML_OP_CPY,
        GGML_OP_CONT,
        GGML_OP_RESHAPE,
@ -244,9 +305,14 @@ enum ggml_op {
        GGML_OP_PERMUTE,
        GGML_OP_TRANSPOSE,
        GGML_OP_GET_ROWS,
+        GGML_OP_GET_ROWS_BACK,
+        GGML_OP_DIAG,
        GGML_OP_DIAG_MASK_INF,
+        GGML_OP_DIAG_MASK_ZERO,
        GGML_OP_SOFT_MAX,
        GGML_OP_ROPE,
+        GGML_OP_ROPE_BACK,
+        GGML_OP_ALIBI,
        GGML_OP_CONV_1D_1S,
        GGML_OP_CONV_1D_2S,

@ -275,6 +341,7 @@ static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
    // n-dimensional tensor
    struct ggml_tensor {
        enum ggml_type    type;
+        enum ggml_backend backend;

        int     n_dims;
        int64_t ne[GGML_MAX_DIMS]; // number of elements
@ -302,7 +369,10 @@ struct ggml_tensor {
        int64_t perf_time_us;

        void * data;
-    char padding[8];
+
+        char name[32];
+
+        char padding[16];
    };

    // computation graph
@ -338,56 +408,67 @@ struct ggml_init_params {
        bool   no_alloc;   // don't allocate memory for the tensor data
    };

-void    ggml_time_init(void); // call this once at the beginning of the program
-int64_t ggml_time_ms(void);
-int64_t ggml_time_us(void);
-int64_t ggml_cycles(void);
-int64_t ggml_cycles_per_ms(void);
+    // misc

-void ggml_print_object (const struct ggml_object * obj);
-void ggml_print_objects(const struct ggml_context * ctx);
+    GGML_API void    ggml_time_init(void); // call this once at the beginning of the program
+    GGML_API int64_t ggml_time_ms(void);
+    GGML_API int64_t ggml_time_us(void);
+    GGML_API int64_t ggml_cycles(void);
+    GGML_API int64_t ggml_cycles_per_ms(void);

-int64_t ggml_nelements(const struct ggml_tensor * tensor);
-size_t  ggml_nbytes   (const struct ggml_tensor * tensor);
+    GGML_API void    ggml_print_object (const struct ggml_object * obj);
+    GGML_API void    ggml_print_objects(const struct ggml_context * ctx);

-int    ggml_blck_size (enum ggml_type type);
-size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
+    GGML_API size_t  ggml_nbytes   (const struct ggml_tensor * tensor);

-size_t ggml_element_size(const struct ggml_tensor * tensor);
+    GGML_API int     ggml_blck_size (enum ggml_type type);
+    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float

-struct ggml_context * ggml_init(struct ggml_init_params params);
-void ggml_free(struct ggml_context * ctx);
+    GGML_API const char * ggml_type_name(enum ggml_type type);

-size_t ggml_used_mem(const struct ggml_context * ctx);
+    GGML_API size_t  ggml_element_size(const struct ggml_tensor * tensor);

-size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+    GGML_API bool    ggml_is_quantized(enum ggml_type type);

-struct ggml_tensor * ggml_new_tensor(
+    // TODO: temporary until model loading of ggml examples is refactored
+    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
+
+    // main
+
+    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
+    GGML_API void    ggml_free(struct ggml_context * ctx);
+
+    GGML_API size_t  ggml_used_mem(const struct ggml_context * ctx);
+
+    GGML_API size_t  ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+
+    GGML_API struct ggml_tensor * ggml_new_tensor(
            struct ggml_context * ctx,
            enum   ggml_type type,
            int    n_dims,
            const int64_t *ne);

-struct ggml_tensor * ggml_new_tensor_1d(
+    GGML_API struct ggml_tensor * ggml_new_tensor_1d(
            struct ggml_context * ctx,
            enum   ggml_type type,
            int64_t ne0);

-struct ggml_tensor * ggml_new_tensor_2d(
+    GGML_API struct ggml_tensor * ggml_new_tensor_2d(
            struct ggml_context * ctx,
            enum   ggml_type type,
            int64_t ne0,
            int64_t ne1);

-struct ggml_tensor * ggml_new_tensor_3d(
+    GGML_API struct ggml_tensor * ggml_new_tensor_3d(
            struct ggml_context * ctx,
            enum   ggml_type type,
            int64_t ne0,
            int64_t ne1,
            int64_t ne2);

-struct ggml_tensor * ggml_new_tensor_4d(
+    GGML_API struct ggml_tensor * ggml_new_tensor_4d(
            struct ggml_context * ctx,
            enum   ggml_type type,
            int64_t ne0,
@ -395,122 +476,179 @@ struct ggml_tensor * ggml_new_tensor_4d(
            int64_t ne2,
            int64_t ne3);

-struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
-struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
+    GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
+    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);

-struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
+    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);

-struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
-struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
-struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
+    GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
+    GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);

-int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
-void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
+    GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_API void    ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);

-float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
-void  ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
+    GGML_API float   ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
+    GGML_API void    ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);

- void * ggml_get_data    (const struct ggml_tensor * tensor);
-float * ggml_get_data_f32(const struct ggml_tensor * tensor);
+    GGML_API void *  ggml_get_data    (const struct ggml_tensor * tensor);
+    GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
+
+    GGML_API const char * ggml_get_name(const struct ggml_tensor * tensor);
+    GGML_API void         ggml_set_name(struct ggml_tensor * tensor, const char * name);

    //
    // operations on tensors with backpropagation
    //

-struct ggml_tensor * ggml_dup(
+    GGML_API struct ggml_tensor * ggml_dup(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_add(
+    GGML_API struct ggml_tensor * ggml_add(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-struct ggml_tensor * ggml_sub(
+    GGML_API struct ggml_tensor * ggml_add_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-struct ggml_tensor * ggml_mul(
+    GGML_API struct ggml_tensor * ggml_add1(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-struct ggml_tensor * ggml_div(
+    GGML_API struct ggml_tensor * ggml_acc(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_acc_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_sub(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-struct ggml_tensor * ggml_sqr(
+    GGML_API struct ggml_tensor * ggml_mul(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_div(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    GGML_API struct ggml_tensor * ggml_sqr(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_sqrt(
+    GGML_API struct ggml_tensor * ggml_sqrt(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    GGML_API struct ggml_tensor * ggml_log_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    // return scalar
-// TODO: compute sum along rows
-struct ggml_tensor * ggml_sum(
+    GGML_API struct ggml_tensor * ggml_sum(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+    GGML_API struct ggml_tensor * ggml_sum_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    // mean along rows
-struct ggml_tensor * ggml_mean(
+    GGML_API struct ggml_tensor * ggml_mean(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    // if a is the same shape as b, and a is not parameter, return a
    // otherwise, return a new tensor: repeat(a) to fit in b
-struct ggml_tensor * ggml_repeat(
+    GGML_API struct ggml_tensor * ggml_repeat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-struct ggml_tensor * ggml_abs(
+    GGML_API struct ggml_tensor * ggml_abs(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_sgn(
+    GGML_API struct ggml_tensor * ggml_sgn(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_neg(
+    GGML_API struct ggml_tensor * ggml_neg(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_step(
+    GGML_API struct ggml_tensor * ggml_step(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_relu(
+    GGML_API struct ggml_tensor * ggml_relu(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    // TODO: double-check this computation is correct
-struct ggml_tensor * ggml_gelu(
+    GGML_API struct ggml_tensor * ggml_gelu(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_silu(
+    GGML_API struct ggml_tensor * ggml_silu(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_silu_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // normalize along rows
    // TODO: eps is hardcoded to 1e-5 for now
-struct ggml_tensor * ggml_norm(
+    GGML_API struct ggml_tensor * ggml_norm(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_rms_norm(
+    GGML_API struct ggml_tensor * ggml_rms_norm(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

+    // a - x
+    // b - dy
+    GGML_API struct ggml_tensor * ggml_rms_norm_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
    // A: m rows, n columns
    // B: p rows, n columns (i.e. we transpose it internally)
    // result is m columns, p rows
-struct ggml_tensor * ggml_mul_mat(
+    GGML_API struct ggml_tensor * ggml_mul_mat(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);
@ -519,33 +657,92 @@ struct ggml_tensor * ggml_mul_mat(
    // operations on tensors without backpropagation
    //

-// in-place, returns view(a)
-struct ggml_tensor * ggml_scale(
+    GGML_API struct ggml_tensor * ggml_scale(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_scale_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return modified a
+    GGML_API struct ggml_tensor * ggml_set(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,nb3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                nb2,
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return modified a
+    GGML_API struct ggml_tensor * ggml_set_2d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+    // b -> view(a,offset,nb1,nb2,3), return view(a)
+    GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            size_t                nb1,
+            size_t                offset);
+
+
    // a -> b, return view(b)
-struct ggml_tensor * ggml_cpy(
+    GGML_API struct ggml_tensor * ggml_cpy(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

    // make contiguous
-struct ggml_tensor * ggml_cont(
+    GGML_API struct ggml_tensor * ggml_cont(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    // return view(a), b specifies the new shape
    // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape(
+    GGML_API struct ggml_tensor * ggml_reshape(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

    // return view(a)
    // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape_2d(
+    GGML_API struct ggml_tensor * ggml_reshape_1d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0);
+
+    GGML_API struct ggml_tensor * ggml_reshape_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int64_t               ne0,
@ -553,21 +750,29 @@ struct ggml_tensor * ggml_reshape_2d(

    // return view(a)
    // TODO: when we start computing gradient, make a copy instead of view
-struct ggml_tensor * ggml_reshape_3d(
+    GGML_API struct ggml_tensor * ggml_reshape_3d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int64_t               ne0,
            int64_t               ne1,
            int64_t               ne2);

+    GGML_API struct ggml_tensor * ggml_reshape_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3);
+
    // offset in bytes
-struct ggml_tensor * ggml_view_1d(
+    GGML_API struct ggml_tensor * ggml_view_1d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int64_t               ne0,
            size_t                offset);

-struct ggml_tensor * ggml_view_2d(
+    GGML_API struct ggml_tensor * ggml_view_2d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int64_t               ne0,
@ -575,7 +780,7 @@ struct ggml_tensor * ggml_view_2d(
            size_t                nb1, // row stride in bytes
            size_t                offset);

-struct ggml_tensor * ggml_view_3d(
+    GGML_API struct ggml_tensor * ggml_view_3d(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int64_t               ne0,
@ -585,7 +790,19 @@ struct ggml_tensor * ggml_view_3d(
            size_t                nb2, // slice stride in bytes
            size_t                offset);

-struct ggml_tensor * ggml_permute(
+    GGML_API struct ggml_tensor * ggml_view_4d(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int64_t               ne0,
+            int64_t               ne1,
+            int64_t               ne2,
+            int64_t               ne3,
+            size_t                nb1, // row   stride in bytes
+            size_t                nb2, // slice stride in bytes
+            size_t                nb3,
+            size_t                offset);
+
+    GGML_API struct ggml_tensor * ggml_permute(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   axis0,
@ -594,60 +811,116 @@ struct ggml_tensor * ggml_permute(
            int                   axis3);

    // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
-struct ggml_tensor * ggml_transpose(
+    GGML_API struct ggml_tensor * ggml_transpose(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

-struct ggml_tensor * ggml_get_rows(
+    GGML_API struct ggml_tensor * ggml_get_rows(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

+    GGML_API struct ggml_tensor * ggml_get_rows_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            struct ggml_tensor  * b,
+            struct ggml_tensor  * c);
+
+    GGML_API struct ggml_tensor * ggml_diag(
+        struct ggml_context     * ctx,
+        struct ggml_tensor      * a);
+
    // set elements above the diagonal to -INF
-// in-place, returns view(a)
-struct ggml_tensor * ggml_diag_mask_inf(
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past);

    // in-place, returns view(a)
-struct ggml_tensor * ggml_soft_max(
+    GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // set elements above the diagonal to 0
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    // in-place variant of ggml_diag_mask_zero, returns view(a)
+    GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past);
+
+    GGML_API struct ggml_tensor * ggml_soft_max(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a);
+
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_soft_max_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);

    // rotary position embedding
-// in-place, returns view(a)
-// if mode == 1, skip n_past elements
+    // if (mode & 1) != 0, skip n_past elements
+    // if (mode & 2) != 0, GPT-NeoX style
    // TODO: avoid creating a new tensor every time
-struct ggml_tensor * ggml_rope(
+    GGML_API struct ggml_tensor * ggml_rope(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            int                   n_past,
            int                   n_dims,
            int                   mode);

+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_rope_inplace(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // rotary position embedding backward, i.e compute dx from dy
+    // a - dy
+    GGML_API struct ggml_tensor * ggml_rope_back(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_dims,
+            int                   mode);
+
+    // alibi position embedding
+    // in-place, returns view(a)
+    GGML_API struct ggml_tensor * ggml_alibi(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,
+            int                   n_past,
+            int                   n_head);
+
    // padding = 1
    // TODO: we don't support extra parameters for now
    //       that's why we are hard-coding the stride, padding, and dilation
    //       not great ..
-struct ggml_tensor * ggml_conv_1d_1s(
+    GGML_API struct ggml_tensor * ggml_conv_1d_1s(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-struct ggml_tensor * ggml_conv_1d_2s(
+    GGML_API struct ggml_tensor * ggml_conv_1d_2s(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b);

-struct ggml_tensor * ggml_flash_attn(
+    GGML_API struct ggml_tensor * ggml_flash_attn(
            struct ggml_context * ctx,
            struct ggml_tensor  * q,
            struct ggml_tensor  * k,
            struct ggml_tensor  * v,
            bool                  masked);

-struct ggml_tensor * ggml_flash_ff(
+    GGML_API struct ggml_tensor * ggml_flash_ff(
            struct ggml_context * ctx,
            struct ggml_tensor  * a,
            struct ggml_tensor  * b0,
@ -659,38 +932,38 @@ struct ggml_tensor * ggml_flash_ff(
    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

-struct ggml_tensor * ggml_map_unary_f32(
+    GGML_API struct ggml_tensor * ggml_map_unary_f32(
            struct ggml_context        * ctx,
            struct ggml_tensor         * a,
-        const  ggml_unary_op_f32_t fun);
+                   ggml_unary_op_f32_t   fun);

-struct ggml_tensor * ggml_map_binary_f32(
+    GGML_API struct ggml_tensor * ggml_map_binary_f32(
            struct ggml_context         * ctx,
            struct ggml_tensor          * a,
            struct ggml_tensor          * b,
-        const  ggml_binary_op_f32_t fun);
+                   ggml_binary_op_f32_t   fun);

    //
    // automatic differentiation
    //

-void ggml_set_param(
+    GGML_API void ggml_set_param(
            struct ggml_context * ctx,
            struct ggml_tensor * tensor);

-void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);

-struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
-struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
+    GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
+    GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);

-void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-void ggml_graph_reset  (struct ggml_cgraph * cgraph);
+    GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API void ggml_graph_reset  (struct ggml_cgraph * cgraph);

    // print info and performance information for the graph
-void ggml_graph_print(const struct ggml_cgraph * cgraph);
+    GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);

    // dump the graph into a file using the dot format
-void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
+    GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);

    //
    // optimization
@ -783,10 +1056,10 @@ struct ggml_opt_params {
        } lbfgs;
    };

-struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
+    GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);

    // optimize the function defined by the tensor f
-enum ggml_opt_result ggml_opt(
+    GGML_API enum ggml_opt_result ggml_opt(
            struct ggml_context * ctx,
            struct ggml_opt_params params,
            struct ggml_tensor * f);
@ -795,26 +1068,35 @@ enum ggml_opt_result ggml_opt(
    // quantization
    //

-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
+    GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
+
+    GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);

    //
    // system info
    //

-int ggml_cpu_has_avx(void);
-int ggml_cpu_has_avx2(void);
-int ggml_cpu_has_avx512(void);
-int ggml_cpu_has_fma(void);
-int ggml_cpu_has_neon(void);
-int ggml_cpu_has_arm_fma(void);
-int ggml_cpu_has_f16c(void);
-int ggml_cpu_has_fp16_va(void);
-int ggml_cpu_has_wasm_simd(void);
-int ggml_cpu_has_blas(void);
-int ggml_cpu_has_sse3(void);
-int ggml_cpu_has_vsx(void);
-
+    GGML_API int ggml_cpu_has_avx        (void);
+    GGML_API int ggml_cpu_has_avx2       (void);
+    GGML_API int ggml_cpu_has_avx512     (void);
+    GGML_API int ggml_cpu_has_avx512_vbmi(void);
+    GGML_API int ggml_cpu_has_avx512_vnni(void);
+    GGML_API int ggml_cpu_has_fma        (void);
+    GGML_API int ggml_cpu_has_neon       (void);
+    GGML_API int ggml_cpu_has_arm_fma    (void);
+    GGML_API int ggml_cpu_has_f16c       (void);
+    GGML_API int ggml_cpu_has_fp16_va    (void);
+    GGML_API int ggml_cpu_has_wasm_simd  (void);
+    GGML_API int ggml_cpu_has_blas       (void);
+    GGML_API int ggml_cpu_has_cublas     (void);
+    GGML_API int ggml_cpu_has_clblast    (void);
+    GGML_API int ggml_cpu_has_gpublas    (void);
+    GGML_API int ggml_cpu_has_sse3       (void);
+    GGML_API int ggml_cpu_has_vsx        (void);

    //
    // Internal types and functions exposed for tests and benchmarks
@ -834,7 +1116,9 @@ typedef struct {
        dequantize_row_q_t dequantize_row_q;
        quantize_row_q_t   quantize_row_q;
        quantize_row_q_t   quantize_row_q_reference;
+        quantize_row_q_t   quantize_row_q_dot;
        vec_dot_q_t        vec_dot_q;
+        enum ggml_type     vec_dot_type;
    } quantize_fns_t;

    quantize_fns_t ggml_internal_get_quantize_fn(size_t i);
--- a/models/README.md
+++ b/models/README.md
@ -1,15 +1,17 @@
 ## Whisper model files in custom ggml format

 The [original Whisper PyTorch models provided by OpenAI](https://github.com/openai/whisper/blob/main/whisper/__init__.py#L17-L27)
-have been converted to custom `ggml` format in order to be able to load them in C/C++. The conversion has been performed
-using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either obtain the original models and generate
-the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
-script to download the already converted models. Currently, they are hosted on the following locations:
+are converted to custom `ggml` format in order to be able to load them in C/C++.
+Conversion is performed using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script.
+
+You can either obtain the original models and generate the `ggml` files yourself using the conversion script,
+or you can use the [download-ggml-model.sh](download-ggml-model.sh) script to download the already converted models.
+Currently, they are hosted on the following locations:

 - https://huggingface.co/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

-Sample usage:
+Sample download:

 ```java
 $ ./download-ggml-model.sh base.en
@ -21,6 +23,16 @@ You can now use it like this:
  $ ./main -m models/ggml-base.en.bin -f samples/jfk.wav
 ```

+To convert the files yourself, use the convert-pt-to-ggml.py script. Here is an example usage.
+The original PyTorch files are assumed to have been downloaded into `~/.cache/whisper`.
+Change `~/path/to/repo/whisper/` to the location of your copy of the Whisper source:
+```
+mkdir models/whisper-medium
+python models/convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+mv ./models/whisper-medium/ggml-model.bin models/ggml-medium.bin
+rmdir models/whisper-medium
+```
+
 A third option to obtain the model files is to download them from Hugging Face:

 https://huggingface.co/ggerganov/whisper.cpp/tree/main
--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@ -23,6 +23,7 @@ import json
 import code
 import torch
 import numpy as np
+from pathlib import Path

 from transformers import WhisperForConditionalGeneration

@ -75,16 +76,13 @@ if len(sys.argv) < 4:
    print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n")
    sys.exit(1)

-dir_model   = sys.argv[1]
-dir_whisper = sys.argv[2]
-dir_out     = sys.argv[3]
+dir_model   = Path(sys.argv[1])
+dir_whisper = Path(sys.argv[2])
+dir_out     = Path(sys.argv[3])

-with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
-    encoder = json.load(f)
-with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
-    encoder_added = json.load(f)
-with open(dir_model + "/config.json", "r", encoding="utf8") as f:
-    hparams = json.load(f)
+encoder = json.loads((dir_model / "vocab.json").read_text(encoding="utf8"))  # read_text() closes the file
+encoder_added = json.loads((dir_model / "added_tokens.json").read_text(encoding="utf8"))
+hparams = json.loads((dir_model / "config.json").read_text(encoding="utf8"))

 model = WhisperForConditionalGeneration.from_pretrained(dir_model)

@ -96,16 +94,15 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as

 dir_tokenizer = dir_model

-fname_out = dir_out + "/ggml-model.bin"
+fname_out = dir_out / "ggml-model.bin"

-with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f:
-    tokens = json.load(f)
+tokens = json.loads((dir_tokenizer / "vocab.json").read_text(encoding="utf8"))  # read_text() closes the file

 # use 16-bit or 32-bit floats
 use_f16 = True
 if len(sys.argv) > 4:
    use_f16 = False
-    fname_out = dir_out + "/ggml-model-f32.bin"
+    fname_out = dir_out / "ggml-model-f32.bin"

 fout = open(fname_out, "wb")

@ -171,10 +168,9 @@ for name in list_vars.keys():
    data = data.astype(np.float16)

    # reshape conv bias from [n] to [n, 1]
-    if name == "encoder.conv1.bias" or \
-       name == "encoder.conv2.bias":
+    if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
        data = data.reshape(data.shape[0], 1)
-        print("  Reshaped variable: " + name + " to shape: ", data.shape)
+        print("  Reshaped variable: " , name , " to shape: ", data.shape)

    n_dims = len(data.shape)
    print(name, n_dims, data.shape)
@ -182,7 +178,7 @@ for name in list_vars.keys():
    # looks like the whisper models are in f16 by default
    # so we need to convert the small tensors to f32 until we fully support f16 in ggml
    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 1;
+    ftype = 1
    if use_f16:
        if n_dims < 2 or \
                name == "encoder.conv1.bias"   or \
@ -197,16 +193,16 @@ for name in list_vars.keys():
        ftype = 0

    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
+    str_ = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(str_), ftype))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
+    fout.write(str_)

    # data
    data.tofile(fout)

 fout.close()

-print("Done. Output file: " + fname_out)
+print("Done. Output file: " , fname_out)
 print("")
--- a/models/convert-pt-to-ggml.py
+++ b/models/convert-pt-to-ggml.py
@ -40,7 +40,7 @@ import code
 import torch
 import numpy as np
 import base64
-
+from pathlib import Path
 #from transformers import GPTJForCausalLM
 #from transformers import GPT2TokenizerFast

@ -194,17 +194,17 @@ if len(sys.argv) < 4:
    print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
    sys.exit(1)

-fname_inp   = sys.argv[1]
-dir_whisper = sys.argv[2]
-dir_out     = sys.argv[3]
+fname_inp   = Path(sys.argv[1])
+dir_whisper = Path(sys.argv[2])
+dir_out     = Path(sys.argv[3])

 # try to load PyTorch binary data
 try:
    model_bytes = open(fname_inp, "rb").read()
    with io.BytesIO(model_bytes) as fp:
        checkpoint = torch.load(fp, map_location="cpu")
-except:
-    print("Error: failed to load PyTorch model file: %s" % fname_inp)
+except Exception:
+    print("Error: failed to load PyTorch model file:" , fname_inp)
    sys.exit(1)

 hparams = checkpoint["dims"]
@ -218,17 +218,17 @@ list_vars = checkpoint["model_state_dict"]

 # load mel filters
 n_mels = hparams["n_mels"]
-with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
+with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
    filters = torch.from_numpy(f[f"mel_{n_mels}"])
    #print (filters)

 #code.interact(local=locals())

 multilingual = hparams["n_vocab"] == 51865
-tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
+tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")

 # output in the same directory as the model
-fname_out = dir_out + "/ggml-model.bin"
+fname_out = dir_out / "ggml-model.bin"

 with open(tokenizer, "rb") as f:
    contents = f.read()
@ -238,9 +238,9 @@ with open(tokenizer, "rb") as f:
 use_f16 = True
 if len(sys.argv) > 4:
    use_f16 = False
-    fname_out = dir_out + "/ggml-model-f32.bin"
+    fname_out = dir_out / "ggml-model-f32.bin"

-fout = open(fname_out, "wb")
+fout = fname_out.open("wb")

 fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
 fout.write(struct.pack("i", hparams["n_vocab"]))
@ -273,20 +273,19 @@ for key in tokens:

 for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
+    print("Processing variable: " , name ,  " with shape: ", data.shape)

    # reshape conv bias from [n] to [n, 1]
-    if name == "encoder.conv1.bias" or \
-       name == "encoder.conv2.bias":
+    if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
        data = data.reshape(data.shape[0], 1)
-        print("  Reshaped variable: " + name + " to shape: ", data.shape)
+        print(f"  Reshaped variable: {name} to shape: ", data.shape)

-    n_dims = len(data.shape);
+    n_dims = len(data.shape)

    # looks like the whisper models are in f16 by default
    # so we need to convert the small tensors to f32 until we fully support f16 in ggml
    # ftype == 0 -> float32, ftype == 1 -> float16
-    ftype = 1;
+    ftype = 1
    if use_f16:
        if n_dims < 2 or \
                name == "encoder.conv1.bias"   or \
@ -307,16 +306,16 @@ for name in list_vars.keys():
    #        data = data.transpose()

    # header
-    str = name.encode('utf-8')
-    fout.write(struct.pack("iii", n_dims, len(str), ftype))
+    str_ = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(str_), ftype))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
-    fout.write(str);
+    fout.write(str_)

    # data
    data.tofile(fout)

 fout.close()

-print("Done. Output file: " + fname_out)
+print("Done. Output file: " , fname_out)
 print("")
--- a/models/convert-whisper-to-coreml.py
+++ b/models/convert-whisper-to-coreml.py
@ -20,7 +20,7 @@ def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
    """
    for k in state_dict:
        is_attention = all(substr in k for substr in ['attn', '.weight'])
-        is_mlp = any([k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight']])
+        is_mlp = any(k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight'])

        if (is_attention or is_mlp) and len(state_dict[k].shape) == 2:
            state_dict[k] = state_dict[k][:, :, None, None]
@ -42,11 +42,10 @@ class LayerNormANE(LayerNormANEBase):
 class MultiHeadAttentionANE(MultiHeadAttention):
    def __init__(self, n_state: int, n_head: int):
        super().__init__(n_state, n_head)
-
-        setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1))
-        setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False))
-        setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1))
-        setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))
+        self.query =  nn.Conv2d(n_state, n_state, kernel_size=1)
+        self.key = nn.Conv2d(n_state, n_state, kernel_size=1, bias=False)
+        self.value = nn.Conv2d(n_state, n_state, kernel_size=1)
+        self.out = nn.Conv2d(n_state, n_state, kernel_size=1)

    def forward(self,
                x: Tensor,
@ -104,30 +103,28 @@ class MultiHeadAttentionANE(MultiHeadAttention):
 class ResidualAttentionBlockANE(ResidualAttentionBlock):
    def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
        super().__init__(n_state, n_head, cross_attention)
-
-        setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head))
-        setattr(self, 'attn_ln', LayerNormANE(n_state))
-
-        setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None)
-        setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None)
+        self.attn =  MultiHeadAttentionANE(n_state, n_head)
+        self.attn_ln = LayerNormANE(n_state)
+        self.cross_attn =  MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
+        self.cross_attn_ln =  LayerNormANE(n_state) if cross_attention else None

        n_mlp = n_state * 4
-        setattr(self, 'mlp', nn.Sequential(
+        self.mlp =  nn.Sequential(
            nn.Conv2d(n_state, n_mlp, kernel_size=1),
            nn.GELU(),
            nn.Conv2d(n_mlp, n_state, kernel_size=1)
-        ))
-        setattr(self, 'mlp_ln', LayerNormANE(n_state))
+        )
+        self.mlp_ln = LayerNormANE(n_state)


 class AudioEncoderANE(AudioEncoder):
    def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)

-        setattr(self, 'blocks', nn.ModuleList(
+        self.blocks = nn.ModuleList(
            [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
-        ))
-        setattr(self, 'ln_post', LayerNormANE(n_state))
+        )
+        self.ln_post = LayerNormANE(n_state)

    def forward(self, x: Tensor):
        """
@ -168,10 +165,10 @@ class TextDecoderANE(TextDecoder):
    def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
        super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)

-        setattr(self, 'blocks', nn.ModuleList(
+        self.blocks= nn.ModuleList(
            [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
-        ))
-        setattr(self, 'ln', LayerNormANE(n_state))
+        )
+        self.ln= LayerNormANE(n_state)

    def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
        """
@ -213,20 +210,20 @@ class WhisperANE(Whisper):
    def __init__(self, dims: ModelDimensions):
        super().__init__(dims)

-        setattr(self, 'encoder', AudioEncoderANE(
+        self.encoder = AudioEncoderANE(
            self.dims.n_mels,
            self.dims.n_audio_ctx,
            self.dims.n_audio_state,
            self.dims.n_audio_head,
            self.dims.n_audio_layer,
-        ))
-        setattr(self, 'decoder', TextDecoderANE(
+        )
+        self.decoder = TextDecoderANE(
            self.dims.n_vocab,
            self.dims.n_text_ctx,
            self.dims.n_text_state,
            self.dims.n_text_head,
            self.dims.n_text_layer,
-        ))
+        )

        self._register_load_state_dict_pre_hook(linear_to_conv2d_map)

--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -62,7 +62,7 @@ if [ -f "ggml-$model.bin" ]; then
 fi

 if [ -x "$(command -v wget)" ]; then
-    wget --quiet --show-progress -O ggml-$model.bin $src/$pfx-$model.bin
+    wget --no-config --quiet --show-progress -O ggml-$model.bin $src/$pfx-$model.bin
 elif [ -x "$(command -v curl)" ]; then
    curl -L --output ggml-$model.bin $src/$pfx-$model.bin
 else
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@ -13,7 +13,7 @@
 #
 # Usage:
 #
-#   ./tests/run-tests.sh <model_name>
+#   ./tests/run-tests.sh <model_name> [threads]
 #

 cd `dirname $0`
@ -32,7 +32,7 @@ function list_models {
 }

 if [ $# -eq 0 ]; then
-    printf "Usage: $0 [model]\n\n"
+    printf "Usage: $0 [model] [threads]\n\n"
    printf "No model specified. Aborting\n"
    list_models
    exit 1
@ -41,6 +41,11 @@ fi
 model=$1
 main="../main"

+threads=""
+if [ $# -eq 2 ]; then
+    threads="-t $2"
+fi
+
 if [ ! -f ../models/ggml-$model.bin ]; then
    printf "Model $model not found. Aborting\n"
    list_models
@ -105,7 +110,7 @@ function run_lang() {
            fi
        fi

-        $main -m ../models/ggml-$model.bin -f $fname_dst -l $lang -otxt 2> /dev/null
+        $main -m ../models/ggml-$model.bin $threads -f $fname_dst -l $lang -otxt 2> /dev/null

        git diff --no-index --word-diff=color --word-diff-regex=. $lang-$i-ref.txt $fname_dst.txt

--- a/whisper.cpp
+++ b/whisper.cpp
@ -1,4 +1,3 @@
-#define WHISPER_BUILD
 #include "whisper.h"
 #if WHISPER_USE_COREML
 #include "coreml/whisper-encoder.h"
@ -102,7 +101,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
 #define WHISPER_PRINT_DEBUG(...)
 #endif

-#define WHISPER_USE_FLASH_ATTN
+//#define WHISPER_USE_FLASH_ATTN
 //#define WHISPER_USE_FLASH_FF
 #define WHISPER_MAX_DECODERS 16

@ -224,11 +223,11 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
 static const size_t MB = 1ull*1024*1024;

 static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_TINY,     14ull*MB },
-    { MODEL_BASE,     18ull*MB },
-    { MODEL_SMALL,    28ull*MB },
-    { MODEL_MEDIUM,   36ull*MB },
-    { MODEL_LARGE,    44ull*MB },
+    { MODEL_TINY,     62ull*MB },
+    { MODEL_BASE,     80ull*MB },
+    { MODEL_SMALL,   120ull*MB },
+    { MODEL_MEDIUM,  158ull*MB },
+    { MODEL_LARGE,   198ull*MB },
 };

 static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@ -255,12 +254,70 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
    { MODEL_LARGE,     9ull*MB },
 };

-static const std::map<e_model, size_t> MEM_REQ_MODEL = {
+static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
+    { GGML_TYPE_F32,
+        {
            { MODEL_TINY,     74ull*MB },
            { MODEL_BASE,    142ull*MB },
            { MODEL_SMALL,   466ull*MB },
            { MODEL_MEDIUM, 1464ull*MB },
            { MODEL_LARGE,  2952ull*MB },
+        },
+    },
+    { GGML_TYPE_F16,
+        {
+            { MODEL_TINY,     74ull*MB },
+            { MODEL_BASE,    142ull*MB },
+            { MODEL_SMALL,   466ull*MB },
+            { MODEL_MEDIUM, 1464ull*MB },
+            { MODEL_LARGE,  2952ull*MB },
+        },
+    },
+    { GGML_TYPE_Q4_0,
+        {
+            { MODEL_TINY,     26ull*MB },
+            { MODEL_BASE,     50ull*MB },
+            { MODEL_SMALL,   154ull*MB },
+            { MODEL_MEDIUM,  470ull*MB },
+            { MODEL_LARGE,   940ull*MB },
+        },
+    },
+    { GGML_TYPE_Q4_1,
+        {
+            { MODEL_TINY,     32ull*MB },
+            { MODEL_BASE,     58ull*MB },
+            { MODEL_SMALL,   182ull*MB },
+            { MODEL_MEDIUM,  562ull*MB },
+            { MODEL_LARGE,  1124ull*MB },
+        },
+    },
+    { GGML_TYPE_Q5_0,
+        {
+            { MODEL_TINY,     30ull*MB },
+            { MODEL_BASE,     54ull*MB },
+            { MODEL_SMALL,   170ull*MB },
+            { MODEL_MEDIUM,  516ull*MB },
+            { MODEL_LARGE,  1034ull*MB },
+        },
+    },
+    { GGML_TYPE_Q5_1,
+        {
+            { MODEL_TINY,     32ull*MB },
+            { MODEL_BASE,     58ull*MB },
+            { MODEL_SMALL,   182ull*MB },
+            { MODEL_MEDIUM,  562ull*MB },
+            { MODEL_LARGE,  1124ull*MB },
+        },
+    },
+    { GGML_TYPE_Q8_0,
+        {
+            { MODEL_TINY,     45ull*MB },
+            { MODEL_BASE,     84ull*MB },
+            { MODEL_SMALL,   268ull*MB },
+            { MODEL_MEDIUM,  834ull*MB },
+            { MODEL_LARGE,  1674ull*MB },
+        },
+    },
 };

 static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
@ -280,11 +337,11 @@ static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
 };

 static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
-    { MODEL_TINY,      6ull*MB },
-    { MODEL_BASE,      8ull*MB },
-    { MODEL_SMALL,    13ull*MB },
-    { MODEL_MEDIUM,   22ull*MB },
-    { MODEL_LARGE,    33ull*MB },
+    { MODEL_TINY,     30ull*MB },
+    { MODEL_BASE,     38ull*MB },
+    { MODEL_SMALL,    56ull*MB },
+    { MODEL_MEDIUM,   74ull*MB },
+    { MODEL_LARGE,    94ull*MB },
 };

 static const std::map<e_model, size_t> MEM_REQ_DECODE = {
@ -370,7 +427,7 @@ struct whisper_hparams {
    int32_t n_text_head   = 6;
    int32_t n_text_layer  = 4;
    int32_t n_mels        = 80;
-    int32_t f16           = 1;
+    int32_t ftype         = 1;
 };

 // audio encoding layer
@ -592,7 +649,7 @@ struct whisper_state {

    std::string path_model; // populated by whisper_init_from_file()
 #ifdef WHISPER_USE_COREML
-    whisper_coreml_context * ctx_coreml;
+    whisper_coreml_context * ctx_coreml = nullptr;
 #endif

    // [EXPERIMENTAL] token-level timestamps data
@ -640,7 +697,8 @@ struct whisper_context {
    int64_t t_load_us  = 0;
    int64_t t_start_us = 0;

-    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
+    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
+    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)

    whisper_model model;
    whisper_vocab vocab;
@ -697,7 +755,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
    const ggml_type wtype = cache.k->type;
    WHISPER_ASSERT(wtype == cache.v->type);

-    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
+    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));

    struct ggml_init_params params = {
        /*.mem_size   =*/ cache.buf.size(),
@ -770,7 +828,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        read_safe(loader, hparams.n_text_head);
        read_safe(loader, hparams.n_text_layer);
        read_safe(loader, hparams.n_mels);
-        read_safe(loader, hparams.f16);
+        read_safe(loader, hparams.ftype);

        assert(hparams.n_text_state == hparams.n_audio_state);

@ -794,11 +852,19 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
            model.type = e_model::MODEL_LARGE;
        }

-        // for the big tensors, we have the option to store the data in 16-bit floats
-        // in order to save memory and also to speed up the computation
-        wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;

-        const size_t scale = model.hparams.f16 ? 1 : 2;
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
+        // for the big tensors, we have the option to store the data in 16-bit floats or quantized
+        // in order to save memory and also to speed up the computation
+        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+        if (wctx.wtype == GGML_TYPE_COUNT) {
+            fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
+            return false;
+        }
+
+        const size_t scale = model.hparams.ftype ? 1 : 2;

        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
@ -810,7 +876,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        fprintf(stderr, "%s: f16           = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: ftype         = %d\n", __func__, model.hparams.ftype);
+        fprintf(stderr, "%s: qntvr         = %d\n", __func__, qntvr);
        fprintf(stderr, "%s: type          = %d\n", __func__, model.type);

        // print memory requirements
@ -821,7 +888,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                     MEM_REQ_SCRATCH1.at(model.type) +
                     MEM_REQ_SCRATCH2.at(model.type) +
                     MEM_REQ_SCRATCH3.at(model.type) +
-                scale*MEM_REQ_MODEL.at   (model.type) +
+                scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
                scale*MEM_REQ_KV_CROSS.at(model.type) +
                scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));

@ -837,7 +904,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        // always have at least one decoder

        wctx.model.buf = new std::vector<uint8_t>();
-        wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type));
+        wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type));

        // we skip initialization of the state until it is needed
        // because it might be that state will always be provided externally.
@ -928,6 +995,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
    size_t ctx_size = 0;

    const ggml_type wtype = wctx.wtype;
+    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type

    {
        const auto & hparams = model.hparams;
@ -946,95 +1014,95 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con

        // encoder
        {
-            ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe;
+            ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;

-            ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype);         // e_conv_1_w
-            ctx_size +=          n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b
+            ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype);         // e_conv_1_w
+            ctx_size +=          n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b

-            ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype);         // e_conv_2_w
-            ctx_size +=                 n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b
+            ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype);         // e_conv_2_w
+            ctx_size +=                 n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b

-            ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w;
-            ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b;
+            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
+            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
        }

        // decoder
        {
-            ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe;
+            ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;

-            ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te;
+            ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;

-            ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w;
-            ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b;
+            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
+            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
        }

        // encoder layers
        {
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
+            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
+            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b

-            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype));         // mlp_0_w
-            ctx_size += n_audio_layer*(              4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
+            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // mlp_0_w
+            ctx_size += n_audio_layer*(              4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b

-            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype));         // mlp_1_w
-            ctx_size += n_audio_layer*(                n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
+            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // mlp_1_w
+            ctx_size += n_audio_layer*(                n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b

-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
-            ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
+            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
+            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype));         // attn_q_w
-            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // attn_q_w
+            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype));         // attn_v_w
-            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // attn_v_w
+            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b

-            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype));         // attn_ln_1_w
-            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
+            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype));         // attn_ln_1_w
+            ctx_size += n_audio_layer*(              n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
        }

        // decoder layers
        {
-            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
+            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
+            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b

-            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype));         // mlp_0_w
-            ctx_size += n_text_layer*(             4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
+            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype));         // mlp_0_w
+            ctx_size += n_text_layer*(             4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b

-            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype));         // mlp_1_w
-            ctx_size += n_text_layer*(               n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
+            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype));         // mlp_1_w
+            ctx_size += n_text_layer*(               n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b

-            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
+            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
+            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype));         // attn_q_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // attn_q_w
+            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w
+            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype));         // attn_v_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // attn_v_w
+            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype));         // attn_ln_1_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // attn_ln_1_w
+            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
                                                                                                //
-            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w
-            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b
+            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
+            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype));         // cross_attn_q_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // cross_attn_q_w
+            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w
+            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype));         // cross_attn_v_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // cross_attn_v_w
+            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b

-            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype));         // cross_attn_ln_1_w
-            ctx_size += n_text_layer*(             n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
+            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype));         // cross_attn_ln_1_w
+            ctx_size += n_text_layer*(             n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
        }

-        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
+        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead

        fprintf(stderr, "%s: model ctx     = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
    }
@ -1079,10 +1147,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        {
            model.e_pe       = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);

-            model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype,         3, n_mels, n_audio_state);
+            model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype,         3, n_mels, n_audio_state);
            model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);

-            model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype,         3, n_audio_state, n_audio_state);
+            model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype,         3, n_audio_state, n_audio_state);
            model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);

            model.e_ln_w     = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
@ -1259,18 +1327,18 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        while (true) {
            int32_t n_dims;
            int32_t length;
-            int32_t ftype;
+            int32_t ttype;

            read_safe(loader, n_dims);
            read_safe(loader, length);
-            read_safe(loader, ftype);
+            read_safe(loader, ttype);

            if (loader->eof(loader->context)) {
                break;
            }

            int32_t nelements = 1;
-            int32_t ne[3] = { 1, 1, 1 };
+            int32_t ne[4] = { 1, 1, 1, 1 };
            for (int i = 0; i < n_dims; ++i) {
                read_safe(loader, ne[i]);
                nelements *= ne[i];
@ -1289,6 +1357,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
            auto tensor = model.tensors[name.data()];
            if (ggml_nelements(tensor) != nelements) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+                fprintf(stderr, "%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
+                        __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
                return false;
            }

@ -1298,9 +1368,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                return false;
            }

-            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+            const size_t bpe = ggml_type_size(ggml_type(ttype));

-            if (nelements*bpe != ggml_nbytes(tensor)) {
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                return false;
@ -1309,7 +1379,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
            loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
            BYTESWAP_TENSOR(tensor);

-            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
            total_size += ggml_nbytes(tensor);
            model.n_loaded++;
        }
@ -1385,9 +1455,15 @@ static bool whisper_encode_internal(
        }
    }

-#ifndef WHISPER_USE_COREML
    struct ggml_tensor * cur;

+#ifndef WHISPER_USE_COREML
+    const bool use_coreml = false;
+#else
+    const bool use_coreml = wstate.ctx_coreml != nullptr;
+#endif
+
+    if (!use_coreml) {
        // convolution + gelu
        {
            wstate.use_buf(ctx0, 1);
@ -1474,14 +1550,14 @@ static bool whisper_encode_internal(
                            Qcur),
                        Qcur);

-            //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+                //Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

                // note: no bias for Key
                struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                        layer.attn_k_w,
                        cur);

-            //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+                //Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

                struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                        layer.attn_v_w,
@ -1502,14 +1578,14 @@ static bool whisper_encode_internal(
                    ggml_permute(ctx0,
                            ggml_cpy(ctx0,
                                Qcur,
-                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+                                ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                            0, 2, 1, 3);

                struct ggml_tensor * K =
                    ggml_permute(ctx0,
                            ggml_cpy(ctx0,
                                Kcur,
-                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+                                ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                            0, 2, 1, 3);

                struct ggml_tensor * V =
@ -1519,7 +1595,7 @@ static bool whisper_encode_internal(
                                    Vcur,
                                    n_state/n_head, n_head, n_ctx),
                                1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
+                            ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));

                struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
 #else
@ -1534,28 +1610,19 @@ static bool whisper_encode_internal(
                    ggml_permute(ctx0,
                            ggml_cpy(ctx0,
                                Kcur,
-                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+                                ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                            0, 2, 1, 3);

                // K * Q
                struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

                struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
+                    ggml_scale_inplace(ctx0,
                            KQ,
                            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
                            );

-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
-
-            //struct ggml_tensor * V_trans =
-            //    ggml_permute(ctx0,
-            //            ggml_cpy(ctx0,
-            //                Vcur,
-            //                ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-            //            1, 2, 0, 3);
-
-            //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+                struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_scaled);

                struct ggml_tensor * V =
                    ggml_cpy(ctx0,
@ -1563,11 +1630,11 @@ static bool whisper_encode_internal(
                                ggml_reshape_3d(ctx0,
                                    Vcur,
                                    n_state/n_head, n_head, n_ctx),
-                            0, 2, 1, 3),
-                        ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
+                                1, 2, 0, 3),
+                            ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
                            );

-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
 #endif
                struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

@ -1622,7 +1689,7 @@ static bool whisper_encode_internal(
                wstate.use_buf(ctx0, 0);

                cur = ggml_flash_ff(ctx0,
-                ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
+                        ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
                        layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
 #else
                wstate.use_buf(ctx0, 0);
@ -1693,12 +1760,16 @@ static bool whisper_encode_internal(

            //ggml_graph_print(&gf);
        }
-#else
+    }
+#ifdef WHISPER_USE_COREML
+    else
+    {
        wstate.use_buf(ctx0, -1);

-    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+        cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);

        whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+    }
 #endif

    // cur
@ -1734,7 +1805,7 @@ static bool whisper_encode_internal(
                layer.cross_attn_k_w,
                cur);

-            Kcross = ggml_scale(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));
+            Kcross = ggml_scale_inplace(ctx0, Kcross, ggml_new_f32(ctx0, pow(float(n_state) / n_head, -0.25)));

            wstate.use_buf(ctx0, 1);

@ -1881,14 +1952,14 @@ static bool whisper_decode_internal(
                        Qcur),
                    Qcur);

-            Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

            // note: no bias for Key
            struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                    layer.attn_k_w,
                    cur);

-            Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Kcur = ggml_scale_inplace(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

            // store key and value to memory
            {
@ -1937,14 +2008,14 @@ static bool whisper_decode_internal(
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale(ctx0,
+            //    ggml_scale_inplace(ctx0,
            //            KQ,
            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
            //            );

-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ, n_past);

-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

            struct ggml_tensor * V =
                ggml_view_3d(ctx0, kv_self.v,
@ -2008,7 +2079,7 @@ static bool whisper_decode_internal(
                        Qcur),
                    Qcur);

-            Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+            Qcur = ggml_scale_inplace(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));

            // Kcross is already scaled
            struct ggml_tensor * Kcross =
@ -2048,15 +2119,15 @@ static bool whisper_decode_internal(
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            //struct ggml_tensor * KQ_scaled =
-            //    ggml_scale(ctx0,
+            //    ggml_scale_inplace(ctx0,
            //            KQ,
            //            ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
            //            );

            // no masking for cross-attention
-            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            //struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ);

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

@ -2356,11 +2427,7 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float>
                sum += fft_out[k] * filters.data[j * n_fft + k];
            }

-            if (sum < 1e-10) {
-                sum = 1e-10;
-            }
-
-            sum = log10(sum);
+            sum = log10(std::max(sum, 1e-10));

            mel.data[j * mel.n_len + i] = sum;
        }
@ -2531,6 +2598,15 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
        path_bin = path_bin.substr(0, pos);
    }

+    // match "-qx_x"
+    pos = path_bin.rfind('-');
+    if (pos != std::string::npos) {
+        auto sub = path_bin.substr(pos);
+        if (sub.size() == 5 && sub[1] == 'q' && sub[3] == '_') {
+            path_bin = path_bin.substr(0, pos);
+        }
+    }
+
    path_bin += "-encoder.mlmodelc";

    return path_bin;
@ -2540,9 +2616,9 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
 struct whisper_state * whisper_init_state(whisper_context * ctx) {
    whisper_state * state = new whisper_state;

-    const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
+    const size_t scale = ctx->model.hparams.ftype ? 1 : 2;

-    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
+    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
        fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
        delete state;
        return nullptr;
@ -2553,7 +2629,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
        fprintf(stderr, "%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
    }

-    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) {
+    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
        delete state;
        return nullptr;
@ -2573,10 +2649,12 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
    if (!state->ctx_coreml) {
        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+#ifndef WHISPER_COREML_ALLOW_FALLBACK
        return nullptr;
-    }
-
+#endif
+    } else {
        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+    }
 #endif

    state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
@ -2602,7 +2680,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
 }

 struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
-    whisper_model_loader loader = {};

    fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);

@ -2612,6 +2689,8 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
        return nullptr;
    }

+    whisper_model_loader loader = {};
+
    loader.context = &fin;

    loader.read = [](void * ctx, void * output, size_t read_size) {
@ -2647,10 +2726,11 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
    };

    buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
-    whisper_model_loader loader = {};

    fprintf(stderr, "%s: loading model from buffer\n", __func__);

+    whisper_model_loader loader = {};
+
    loader.context = &ctx;

    loader.read = [](void * ctx, void * output, size_t read_size) {
@ -2747,8 +2827,10 @@ void whisper_free_state(struct whisper_state * state)
        }

 #ifdef WHISPER_USE_COREML
+        if (state->ctx_coreml != nullptr) {
            whisper_coreml_free(state->ctx_coreml);
            state->ctx_coreml = nullptr;
+        }
 #endif

        delete state;
@ -2909,7 +2991,6 @@ int whisper_lang_id(const char * lang) {
        fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
        return -1;
    }
-
    return g_lang.at(lang).first;
 }

@ -3047,8 +3128,8 @@ int whisper_model_n_mels(struct whisper_context * ctx) {
    return ctx->model.hparams.n_mels;
 }

-int whisper_model_f16(struct whisper_context * ctx) {
-    return ctx->model.hparams.f16;
+int whisper_model_ftype(struct whisper_context * ctx) {
+    return ctx->model.hparams.ftype;
 }

 int whisper_model_type(struct whisper_context * ctx) {
@ -3236,6 +3317,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.prompt_n_tokens  =*/ 0,

        /*.language         =*/ "en",
+        /*.detect_language  =*/ false,

        /*.suppress_blank   =*/ true,
        /*.suppress_non_speech_tokens =*/ false,
@ -3303,15 +3385,15 @@ static void whisper_exp_compute_token_level_timestamps(

 // trim from start (in place)
 static inline void ltrim(std::string &s) {
-    s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
-        return !std::isspace(ch);
+    s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
+        return std::isspace(ch);
    }));
 }

 // trim from end (in place)
 static inline void rtrim(std::string &s) {
-    s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
-        return !std::isspace(ch);
+    s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
+        return std::isspace(ch);
    }).base(), s.end());
 }

@ -3822,7 +3904,7 @@ int whisper_full_with_state(
    }

    // auto-detect language if not specified
-    if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0) {
+    if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) {
        std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f);

        const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
@ -3834,6 +3916,9 @@ int whisper_full_with_state(
        params.language = whisper_lang_str(lang_id);

        fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
+        if (params.detect_language) {
+            return 0;
+        }
    }

    if (params.token_timestamps) {
@ -3844,7 +3929,7 @@ int whisper_full_with_state(
    }

    const int seek_start = params.offset_ms/10;
-    const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len_from_state(state) : params.duration_ms/10);
+    const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;

    // if length of spectrogram is less than 1s (100 samples), then return
    // basically don't process anything that is less than 1s
@ -4751,12 +4836,14 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {

    ggml_time_init();

-    size_t n    = 50;
-    size_t arr  = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations
+    size_t n    = 20;
+    size_t arr  = n_threads > 0 ? 1024llu : n_threads; // trick to avoid compiler optimizations

-    // 1 GB array
+    // 1 GB array (arr * 1 MB)
    const size_t size = arr*1024llu*1024llu;

+    // single-thread
+    {
        char * src = (char *) malloc(size);
        char * dst = (char *) malloc(size);

@ -4765,6 +4852,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
        memcpy(dst, src, size); // heat-up

        double tsum = 0.0;
+        double sum  = 0.0;

        for (size_t i = 0; i < n; i++) {
            const int64_t t0 = ggml_time_us();
@ -4775,24 +4863,23 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {

            tsum += (t1 - t0)*1e-6;

-        src[0] = rand();
+            src[rand() % size] = rand() % 256;
        }

-    snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
+        snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
        s += strbuf;

-    // needed to prevent the compile from optimizing the memcpy away
+        // needed to prevent the compiler from optimizing the memcpy away
        {
-        double sum = 0.0;
-
            for (size_t i = 0; i < size; i++) sum += dst[i];

-        snprintf(strbuf, sizeof(strbuf), "sum:    %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
+            snprintf(strbuf, sizeof(strbuf), "sum:    %f\n", sum);
            s += strbuf;
        }

        free(src);
        free(dst);
+    }

    return s.c_str();
 }
@ -4821,25 +4908,42 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    // b: N*N*sizeof(float)
    // c: N*N*sizeof(float)
    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
-    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*512);

+    // fill the buffer with arbitrary (index-derived, not actually random) data
    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;

    for (int j = 0; j < (int) sizes.size(); j++) {
+        int n_q4_0 = 0;
+        int n_q4_1 = 0;
+        int n_q5_0 = 0;
+        int n_q5_1 = 0;
+        int n_q8_0 = 0;
        int n_fp16 = 0;
        int n_fp32 = 0;

        // GFLOPS/s
+        double s_q4_0 = 0.0;
+        double s_q4_1 = 0.0;
+        double s_q5_0 = 0.0;
+        double s_q5_1 = 0.0;
+        double s_q8_0 = 0.0;
        double s_fp16 = 0.0;
        double s_fp32 = 0.0;

        const size_t N = sizes[j];

-        for (int k = 0; k < 2; ++k) {
-            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+        for (int k = 0; k < 7; ++k) {
+            const ggml_type wtype =
+                k == 0 ? GGML_TYPE_Q4_0 :
+                k == 1 ? GGML_TYPE_Q4_1 :
+                k == 2 ? GGML_TYPE_Q5_0 :
+                k == 3 ? GGML_TYPE_Q5_1 :
+                k == 4 ? GGML_TYPE_Q8_0 :
+                k == 5 ? GGML_TYPE_F16  : GGML_TYPE_F32;

-            double & s = k == 0 ? s_fp16 : s_fp32;
-            int    & n = k == 0 ? n_fp16   : n_fp32;
+            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q5_0 : k == 3 ? s_q5_1 : k == 4 ? s_q8_0 : k == 5 ? s_fp16 : /*k == 6*/ s_fp32;
+            int    & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q5_0 : k == 3 ? n_q5_1 : k == 4 ? n_q8_0 : k == 5 ? n_fp16 : /*k == 6*/ n_fp32;

            struct ggml_init_params gparams = {
                /*.mem_size   =*/ buf.size(),
@ -4883,7 +4987,18 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
            s = ((2.0*N*N*N*n)/tsum)*1e-9;
        }

-        snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+        // Q4_0 | Q4_1
+        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs)\n",
+                N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1);
+        s += strbuf;
+
+        // Q5_0 | Q5_1 | Q8_0
+        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n",
+                N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0);
+        s += strbuf;
+
+        // F16 | F32
+        snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16  %7.1f GFLOPS (%3d runs) | F32  %7.1f GFLOPS (%3d runs)\n",
                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
        s += strbuf;
    }
--- a/whisper.h
+++ b/whisper.h
@ -258,7 +258,7 @@ extern "C" {
    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_f16          (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_ftype        (struct whisper_context * ctx);
    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);

    // Token logits obtained from the last call to whisper_decode()
@ -365,6 +365,7 @@ extern "C" {

        // for auto-detection, set to nullptr, "" or "auto"
        const char * language;
+        bool detect_language;

        // common decoding parameters:
        bool suppress_blank;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
Author	SHA1	Message	Date
Georgi Gerganov	a5defbc1b9	release : v1.4.2	2023-05-14 19:06:45 +03:00
Georgi Gerganov	aaf0d41c7c	ggml : add AVX dot products	2023-05-14 18:56:46 +03:00
Georgi Gerganov	0cb820e0f9	talk-llama : fix build + sync latest llama.cpp	2023-05-14 18:46:42 +03:00
Jhen-Jie Hong	16564f554f	readme : improve Core ML model conversion guidance (#915 )	2023-05-14 18:11:08 +03:00
Georgi Gerganov	fd01209d09	coreml : support quantized model files	2023-05-14 18:09:44 +03:00
Georgi Gerganov	e693074aa6	ggml : sync latest ggml - New Q4 and Q5 formats - Various improvements	2023-05-14 18:04:23 +03:00
Rich Jones	d652cf12ec	main : fix help for --no-timestamps arg (#908 )	2023-05-14 17:54:57 +03:00
Georgi Gerganov	2b6a074305	extra : update ggml sync script	2023-05-14 10:01:52 +03:00
Jhen-Jie Hong	5300117471	whisper.objc : enable Core ML in example & fix segmentation fault (#910 ) * coreml : update endcoder header import path * coreml : force objc_arc in whisper-encoder.mm * whisper.objc : create coreml/ group link * whisper.objc : add coreml model link * whisper.objc : update readme * coreml : use -fobjc-arc for coreml/whisper-encoder.mm * ci: create dummy .mlmodelc for pass ios build * whisper.objc : update readme --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-05-14 09:47:02 +03:00
Georgi Gerganov	70af52a316	coreml : fix seg fault, double free (#919 , #917 , #899 )	2023-05-14 09:42:19 +03:00
Georgi Gerganov	1d17cd5bb3	coreml : fix memory leak (#899 )	2023-05-09 18:38:12 +03:00
Jonathan Soo	bf2449dfae	cmake : fix define used for COREML_ALLOW_FALLBACK (#893 )	2023-05-08 21:08:09 +03:00
Luis Herrera	4e4d00c67a	talk-llama : only copy used KV cache in get / set state (#890 ) --------- Co-authored-by: ejones <evan.q.jones@gmail.com>	2023-05-08 20:59:21 +03:00
Clifford Heath	9931d66400	readme : add instructions on converting to GGML + "--no-config" to wget (#874 )	2023-05-08 20:58:36 +03:00
ZaBlazzingZephyrus	1a548c048e	cmake : fix options disabling AVX and AVX2 flags (#885 )	2023-05-08 20:45:53 +03:00
Georgi Gerganov	14bee39b29	cmake : add options to disable CPU flags (#860 )	2023-05-04 19:31:04 +03:00
RelatedTitle	d458fcbc15	ci : add cuBLAS build workflow and fix error causing lines in CMakeLists (#867 ) * Add windows build with cuBLAS * Remove error causing lines for cuBLAS on Windows	2023-05-03 23:47:37 +03:00
Vulcan	919e58b96a	readme : partial OpenCL GPU support via CLBlast (#863 ) * ggml : CLBlast support as in llama.cpp Building with CLBlast speeds up whisper.cpp ~2x on low end / older AMD APUs (CPU with integrated GPU) such as the A9. Usage: WHISPER_CLBLAST=1 make * CMake/Makefile : CLBlast support as in llama.cpp Building with CLBlast speeds up whisper.cpp ~2x on low end / older AMD APUs (CPU with integrated GPU) such as the A9. Usage: ``` Makefile: cd whisper.cpp WHISPER_CLBLAST=1 make CMake: cd whisper.cpp ; mkdir build ; cd build cmake -DWHISPER_CLBLAST=ON .. make ``` * Update README.md Added OpenCL Build Instructions * Instruction: Partial OpenCL GPU support via CLBlast Added build instructions and examples for Make and CMake to support OpenCL enabled GPUs.	2023-05-03 19:24:43 +03:00
Vulcan	05bef0f0e9	build : CLBlast support as in llama.cpp (#862 ) * ggml : CLBlast support as in llama.cpp Building with CLBlast speeds up whisper.cpp ~2x on low end / older AMD APUs (CPU with integrated GPU) such as the A9. Usage: WHISPER_CLBLAST=1 make * CMake/Makefile : CLBlast support as in llama.cpp Building with CLBlast speeds up whisper.cpp ~2x on low end / older AMD APUs (CPU with integrated GPU) such as the A9. Usage: ``` Makefile: cd whisper.cpp WHISPER_CLBLAST=1 make CMake: cd whisper.cpp ; mkdir build ; cd build cmake -DWHISPER_CLBLAST=ON .. make ```	2023-05-02 22:50:32 +03:00
Georgi Gerganov	5974c8facd	ggml : fix 32-bit ARM build + quantization	2023-05-02 21:52:26 +03:00
Georgi Gerganov	0bcb64b184	ggml : sync ggml (clBLAST + tensor names)	2023-05-02 21:24:18 +03:00
Luis Herrera	0bf680fea2	talk-llama : fix session prompt load (#854 )	2023-05-02 20:05:27 +03:00
CRD716	b806420873	whisper : add detect-language mode (#853 ) * add detectlanguage flag * renaming and help * no idea why that last one didn't commit * run language detection if dl is set * help message fix * various fixes * fix quitting * fix language being english on print	2023-05-02 19:51:52 +03:00
Luis Herrera	be5911a9f3	talk-llama : add --session support (#845 ) * feat: adding session support * readme: adding --session info in examples/talk-llama * llama: adding session fixes * readme: updating session doc * talk-llama: update the value of need_to_save_session to true in order to save the session in the subsequent interaction * talk-llama: adding missing function which updates session_tokens	2023-05-01 20:18:10 +03:00
Georgi Gerganov	d375d73b2e	bench : improve benchmarks	2023-05-01 14:44:39 +03:00
Georgi Gerganov	7765770f89	whisper : add memory sizes for Q8_0 (close #846 )	2023-05-01 10:03:56 +03:00
Baffin Lee	872a85ae94	whisper.wasm : fix typo in readme (#832 )	2023-05-01 09:28:05 +03:00
Georgi Gerganov	9c61f5f585	release : v1.4.1	2023-04-30 22:57:42 +03:00
Georgi Gerganov	c94c469592	whisper : fix quantize bug (#842 ) * whisper : debug * whisper : fix bug during quantization	2023-04-30 22:50:04 +03:00
Georgi Gerganov	feac80dd3f	ggml : fix UB (int << 31)	2023-04-30 22:27:30 +03:00
Georgi Gerganov	fa8dbdc888	release : v1.4.0	2023-04-30 19:23:37 +03:00
Georgi Gerganov	4a7d49af95	examples : fix + refactor Levenshtein distance	2023-04-30 19:12:49 +03:00
Georgi Gerganov	794b162a46	whisper : add integer quantization support (#540 ) * whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples	2023-04-30 18:51:57 +03:00
Georgi Gerganov	5fd1bdd7fc	whisper : add GPU support via cuBLAS (#834 ) * make : add WHISPER_CUBLAS * make : fix CUBLAS build * whisper : disable Flash Attention + adjust memory buffers * whisper : remove old commented code * readme : add cuBLAS instructions * cmake : add WHISPER_CUBLAS option * gitignore : ignore build-cublas	2023-04-30 12:14:33 +03:00
Georgi Gerganov	0ccd6746c9	ggml : fix WASM build	2023-04-29 21:37:23 +03:00
Georgi Gerganov	d9b550c0a1	ggml : fix 32-bit ARM NEON (#836 ) * ggml : add support for 32-bit ARM * ggml : fix * ggml : fix	2023-04-29 21:33:33 +03:00
Georgi Gerganov	e9b091c92a	ggml : use vzip instead of vuzp for consistency	2023-04-29 21:14:09 +03:00
Georgi Gerganov	1f30b99208	ggml : fix WASM build	2023-04-29 20:21:25 +03:00
Georgi Gerganov	05c3ea3bc8	ggml : sync with ggml repo (warning fixes + asserts)	2023-04-29 19:33:28 +03:00
Thijs Raymakers	6108d3cc58	whisper : use correct seek_end when offset is used (#833 ) Whenever an `offset_ms` is provided, the value of `seek_end` is calculated incorrectly. This causes Whisper to keep transcribing after the end of the file. The current behavior looks like ``` [00:34:40.000 --> 00:34:47.000] This is an example audio file. [00:34:47.000 --> 00:34:49.000] The text has been redacted [00:34:49.000 --> 00:34:51.000] This is the end of the audio. [00:34:51.000 --> 00:34:52.000] * [00:34:52.000 --> 00:34:53.000] * [00:34:53.000 --> 00:34:54.000] * [00:34:55.000 --> 00:34:56.000] * ... ``` The expected behavior should be ``` [00:34:40.000 --> 00:34:47.000] This is an example audio file. [00:34:47.000 --> 00:34:49.000] The text has been redacted [00:34:49.000 --> 00:34:51.000] This is the end of the audio. - end of program - ``` This commit changes the calculation of the `seek_end` variable to only add `seek_start` if a custom `duration_ms` is provided. Otherwise, it defaults to the end of the file. Signed-off-by: Thijs Raymakers <thijs@raymakers.nl>	2023-04-29 18:55:37 +03:00
Georgi Gerganov	bab97c83d0	tests : add "threads" to run-tests.sh	2023-04-29 12:32:28 +03:00
Georgi Gerganov	3eaeb030ff	extra : add sync-ggml.sh script	2023-04-29 12:32:28 +03:00
Georgi Gerganov	acec73ab6e	ggml : sync latest ggml + llama.cpp updates (quantization)	2023-04-29 12:32:28 +03:00
Zollner	5cc17418c7	whisper.android : add some tips (#816 )	2023-04-29 11:00:20 +03:00
Georgi Gerganov	3efb81dec6	build : add WHISPER_COREML_ALLOW_FALLBACK to make / CMake (#812 )	2023-04-29 10:55:24 +03:00
Canis Lupus	94a7cd2a07	whisper : allow non-CoreML fallback when Core ML cannot be loaded (#812 ) if the Core ML model cannot be loaded, continue without Core ML instead of returning. This allows a single build to transcribe using Core ML models where available, and regular models when not.	2023-04-29 10:49:02 +03:00
Georgi Gerganov	3e82ff4747	whisper : fix bug from previous commit	2023-04-29 10:42:14 +03:00
Georgi Gerganov	b5bd2f43c5	whisper : avoid designated initializers	2023-04-29 10:36:50 +03:00
AsukaMinato	94aa56f19e	minor : improve C++ and Python style (#768 ) * use some STL functions * use self.field than setattr, use pathlib.Path * recover some format * const some iter * Keep the original * 2 space	2023-04-29 10:06:25 +03:00
Georgi Gerganov	4d89ee2e59	readme : add logo	2023-04-28 22:41:29 +03:00
Laytan Laats	70567eff23	main : escape quotes in csv output (#815 )	2023-04-23 19:01:59 +03:00
Taras Glek	02ec83c5d5	stream : flush upon finishing inference (#811 )	2023-04-23 17:00:30 +03:00
Philipp Zabel	2bd4b8d577	examples : add missing #include <cstdint> (#798 ) common.cpp uses uint8_t and uint64_t, which are defined in <cstdint>.	2023-04-23 16:52:52 +03:00
Tauseef Mohiuddin	eecf2c3d41	main : update escape_double_quotes() function (#776 ) Updated the escape_double_quotes() function such that the function now escapes both double quotes and backslashes in the input string. Changes Made: - Renamed the function to escape_quotes_and_backslashes - Modified the condition in the first loop to increment the value of 'escaped_length' for both double quotes and backslashes. - Modified the condition in second loop to add a backslash before the current character if it is a double quote or a backslash. Resolves: #769	2023-04-23 16:47:30 +03:00