mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2024-11-07 08:34:37 +01:00
whisper : add integer quantization support (#540)
* whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples
This commit is contained in:
parent
5fd1bdd7fc
commit
794b162a46
1
.gitignore
vendored
1
.gitignore
vendored
@ -23,6 +23,7 @@ build-sanitize-thread/
|
|||||||
/talk
|
/talk
|
||||||
/talk-llama
|
/talk-llama
|
||||||
/bench
|
/bench
|
||||||
|
/quantize
|
||||||
|
|
||||||
arm_neon.h
|
arm_neon.h
|
||||||
sync.sh
|
sync.sh
|
||||||
|
@ -303,6 +303,12 @@ if (BUILD_SHARED_LIBS)
|
|||||||
|
|
||||||
target_compile_definitions(${TARGET} PUBLIC
|
target_compile_definitions(${TARGET} PUBLIC
|
||||||
WHISPER_SHARED
|
WHISPER_SHARED
|
||||||
|
GGML_SHARED
|
||||||
|
)
|
||||||
|
|
||||||
|
target_compile_definitions(${TARGET} PRIVATE
|
||||||
|
WHISPER_BUILD
|
||||||
|
GGML_BUILD
|
||||||
)
|
)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
|
9
Makefile
9
Makefile
@ -1,4 +1,4 @@
|
|||||||
default: main bench
|
default: main bench quantize
|
||||||
|
|
||||||
ifndef UNAME_S
|
ifndef UNAME_S
|
||||||
UNAME_S := $(shell uname -s)
|
UNAME_S := $(shell uname -s)
|
||||||
@ -243,7 +243,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
|
|||||||
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
|
||||||
|
|
||||||
clean:
|
clean:
|
||||||
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
|
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
|
||||||
|
|
||||||
#
|
#
|
||||||
# Examples
|
# Examples
|
||||||
@ -251,7 +251,7 @@ clean:
|
|||||||
|
|
||||||
CC_SDL=`sdl2-config --cflags --libs`
|
CC_SDL=`sdl2-config --cflags --libs`
|
||||||
|
|
||||||
SRC_COMMON = examples/common.cpp
|
SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
|
||||||
SRC_COMMON_SDL = examples/common-sdl.cpp
|
SRC_COMMON_SDL = examples/common-sdl.cpp
|
||||||
|
|
||||||
main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
|
main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
|
||||||
@ -261,6 +261,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
|
|||||||
bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
|
bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
|
||||||
|
|
||||||
|
quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
|
||||||
|
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
|
||||||
|
|
||||||
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
||||||
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
|
||||||
|
|
||||||
|
17
README.md
17
README.md
@ -15,6 +15,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
|
|||||||
- AVX intrinsics support for x86 architectures
|
- AVX intrinsics support for x86 architectures
|
||||||
- VSX intrinsics support for POWER architectures
|
- VSX intrinsics support for POWER architectures
|
||||||
- Mixed F16 / F32 precision
|
- Mixed F16 / F32 precision
|
||||||
|
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
|
||||||
- Low memory usage (Flash Attention)
|
- Low memory usage (Flash Attention)
|
||||||
- Zero memory allocations at runtime
|
- Zero memory allocations at runtime
|
||||||
- Runs on the CPU
|
- Runs on the CPU
|
||||||
@ -228,6 +229,22 @@ make large
|
|||||||
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
|
||||||
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
|
||||||
|
|
||||||
|
## Quantization
|
||||||
|
|
||||||
|
`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
|
||||||
|
Quantized models require less memory and disk space and depending on the hardware can be processed more efficiently.
|
||||||
|
|
||||||
|
Here are the steps for creating and using a quantized model:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# quantize a model with Q5_0 method
|
||||||
|
make quantize
|
||||||
|
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
|
||||||
|
|
||||||
|
# run the examples as usual, specifying the quantized model file
|
||||||
|
./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
|
||||||
|
```
|
||||||
|
|
||||||
## Core ML support
|
## Core ML support
|
||||||
|
|
||||||
On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
|
On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
|
||||||
|
File diff suppressed because one or more lines are too long
@ -21,10 +21,14 @@ set(TARGET common)
|
|||||||
add_library(${TARGET} STATIC
|
add_library(${TARGET} STATIC
|
||||||
common.h
|
common.h
|
||||||
common.cpp
|
common.cpp
|
||||||
|
common-ggml.h
|
||||||
|
common-ggml.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
include(DefaultTargetOptions)
|
include(DefaultTargetOptions)
|
||||||
|
|
||||||
|
target_link_libraries(${TARGET} PRIVATE whisper)
|
||||||
|
|
||||||
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
|
||||||
if (WHISPER_SDL2)
|
if (WHISPER_SDL2)
|
||||||
@ -62,6 +66,7 @@ else()
|
|||||||
add_subdirectory(stream)
|
add_subdirectory(stream)
|
||||||
add_subdirectory(command)
|
add_subdirectory(command)
|
||||||
add_subdirectory(bench)
|
add_subdirectory(bench)
|
||||||
|
add_subdirectory(quantize)
|
||||||
add_subdirectory(talk)
|
add_subdirectory(talk)
|
||||||
add_subdirectory(talk-llama)
|
add_subdirectory(talk-llama)
|
||||||
endif()
|
endif()
|
||||||
|
@ -18,5 +18,6 @@ describe("Run whisper.node", () => {
|
|||||||
let result = await whisperAsync(whisperParamsMock);
|
let result = await whisperAsync(whisperParamsMock);
|
||||||
|
|
||||||
expect(result.length).toBeGreaterThan(0);
|
expect(result.length).toBeGreaterThan(0);
|
||||||
});
|
}, 10000);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -31,9 +31,9 @@ endif()
|
|||||||
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||||
--bind \
|
--bind \
|
||||||
-s USE_PTHREADS=1 \
|
-s USE_PTHREADS=1 \
|
||||||
-s PTHREAD_POOL_SIZE=8 \
|
-s PTHREAD_POOL_SIZE_STRICT=0 \
|
||||||
-s INITIAL_MEMORY=1024MB \
|
-s INITIAL_MEMORY=2000MB \
|
||||||
-s TOTAL_MEMORY=1024MB \
|
-s TOTAL_MEMORY=2000MB \
|
||||||
-s FORCE_FILESYSTEM=1 \
|
-s FORCE_FILESYSTEM=1 \
|
||||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||||
${EXTRA_FLAGS} \
|
${EXTRA_FLAGS} \
|
||||||
|
@ -35,6 +35,15 @@
|
|||||||
|
|
||||||
<br><br>
|
<br><br>
|
||||||
|
|
||||||
|
<b>More examples:</b>
|
||||||
|
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||||
|
|
||||||
|
<br><br>
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
|
|
||||||
Select the model you would like to use and click the "Bench" button.<br>
|
Select the model you would like to use and click the "Bench" button.<br>
|
||||||
@ -46,9 +55,16 @@
|
|||||||
Whisper model: <span id="model-whisper-status"></span>
|
Whisper model: <span id="model-whisper-status"></span>
|
||||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||||
<span id="fetch-whisper-progress"></span>
|
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
|
||||||
|
|
||||||
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
||||||
|
<br><br>
|
||||||
|
Quantized models:<br><br>
|
||||||
|
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
|
||||||
|
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
|
||||||
|
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
|
||||||
|
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
|
||||||
|
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
|
||||||
|
<span id="fetch-whisper-progress"></span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<br>
|
<br>
|
||||||
@ -160,6 +176,14 @@
|
|||||||
|
|
||||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-small-en').style.display = 'none';
|
||||||
|
|
||||||
|
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
|
||||||
|
|
||||||
document.getElementById('whisper-file' ).style.display = 'none';
|
document.getElementById('whisper-file' ).style.display = 'none';
|
||||||
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
||||||
}
|
}
|
||||||
@ -168,11 +192,25 @@
|
|||||||
let urls = {
|
let urls = {
|
||||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||||
|
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
|
||||||
|
|
||||||
|
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
|
||||||
|
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
|
||||||
|
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
|
||||||
|
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
|
||||||
|
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
|
||||||
};
|
};
|
||||||
|
|
||||||
let sizes = {
|
let sizes = {
|
||||||
'tiny.en': 75,
|
'tiny.en': 75,
|
||||||
'base.en': 142,
|
'base.en': 142,
|
||||||
|
'small.en': 466,
|
||||||
|
|
||||||
|
'tiny-en-q5_1': 31,
|
||||||
|
'base-en-q5_1': 57,
|
||||||
|
'small-en-q5_1': 182,
|
||||||
|
'medium-en-q5_0': 515,
|
||||||
|
'large-q5_0': 1030,
|
||||||
};
|
};
|
||||||
|
|
||||||
let url = urls[model];
|
let url = urls[model];
|
||||||
@ -181,6 +219,15 @@
|
|||||||
|
|
||||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-small-en').style.display = 'none';
|
||||||
|
|
||||||
|
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
|
||||||
|
|
||||||
|
document.getElementById('whisper-file' ).style.display = 'none';
|
||||||
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
||||||
|
|
||||||
cbProgress = function(p) {
|
cbProgress = function(p) {
|
||||||
@ -192,6 +239,15 @@
|
|||||||
var el;
|
var el;
|
||||||
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
|
||||||
|
|
||||||
|
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
|
||||||
|
|
||||||
|
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -35,6 +35,15 @@
|
|||||||
|
|
||||||
<br><br>
|
<br><br>
|
||||||
|
|
||||||
|
<b>More examples:</b>
|
||||||
|
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||||
|
|
||||||
|
<br><br>
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
|
|
||||||
Select the model you would like to use, click the "Start" button and follow the instructions.
|
Select the model you would like to use, click the "Start" button and follow the instructions.
|
||||||
@ -45,6 +54,10 @@
|
|||||||
Whisper model: <span id="model-whisper-status"></span>
|
Whisper model: <span id="model-whisper-status"></span>
|
||||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||||
|
<br><br>
|
||||||
|
Quantized models:<br><br>
|
||||||
|
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
|
||||||
|
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
|
||||||
<span id="fetch-whisper-progress"></span>
|
<span id="fetch-whisper-progress"></span>
|
||||||
|
|
||||||
<!--
|
<!--
|
||||||
@ -162,11 +175,17 @@
|
|||||||
let urls = {
|
let urls = {
|
||||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||||
|
|
||||||
|
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
|
||||||
|
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
|
||||||
};
|
};
|
||||||
|
|
||||||
let sizes = {
|
let sizes = {
|
||||||
'tiny.en': 75,
|
'tiny.en': 75,
|
||||||
'base.en': 142,
|
'base.en': 142,
|
||||||
|
|
||||||
|
'tiny-en-q5_1': 31,
|
||||||
|
'base-en-q5_1': 57,
|
||||||
};
|
};
|
||||||
|
|
||||||
let url = urls[model];
|
let url = urls[model];
|
||||||
@ -177,6 +196,10 @@
|
|||||||
|
|
||||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||||
|
|
||||||
|
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
|
||||||
|
|
||||||
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
||||||
|
|
||||||
cbProgress = function(p) {
|
cbProgress = function(p) {
|
||||||
@ -188,6 +211,10 @@
|
|||||||
var el;
|
var el;
|
||||||
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
||||||
|
|
||||||
|
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
|
||||||
|
|
||||||
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
241
examples/common-ggml.cpp
Normal file
241
examples/common-ggml.cpp
Normal file
@ -0,0 +1,241 @@
|
|||||||
|
#include "common-ggml.h"
|
||||||
|
|
||||||
|
#include <regex>
|
||||||
|
#include <map>
|
||||||
|
|
||||||
|
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
|
||||||
|
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
|
||||||
|
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
|
||||||
|
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
|
||||||
|
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
|
||||||
|
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
|
||||||
|
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
|
||||||
|
};
|
||||||
|
|
||||||
|
void ggml_print_ftypes(FILE * fp) {
|
||||||
|
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
|
||||||
|
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
enum ggml_ftype ggml_parse_ftype(const char * str) {
|
||||||
|
enum ggml_ftype ftype;
|
||||||
|
if (str[0] == 'q') {
|
||||||
|
const auto it = GGML_FTYPE_MAP.find(str);
|
||||||
|
if (it == GGML_FTYPE_MAP.end()) {
|
||||||
|
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
|
||||||
|
return GGML_FTYPE_UNKNOWN;
|
||||||
|
}
|
||||||
|
ftype = it->second;
|
||||||
|
} else {
|
||||||
|
ftype = (enum ggml_ftype) atoi(str);
|
||||||
|
}
|
||||||
|
|
||||||
|
return ftype;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_common_quantize_0(
|
||||||
|
std::ifstream & finp,
|
||||||
|
std::ofstream & fout,
|
||||||
|
const ggml_ftype ftype,
|
||||||
|
const std::vector<std::string> & to_quant,
|
||||||
|
const std::vector<std::string> & to_skip) {
|
||||||
|
|
||||||
|
ggml_type qtype = GGML_TYPE_F32;
|
||||||
|
|
||||||
|
switch (ftype) {
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
|
||||||
|
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
|
||||||
|
case GGML_FTYPE_UNKNOWN:
|
||||||
|
case GGML_FTYPE_ALL_F32:
|
||||||
|
case GGML_FTYPE_MOSTLY_F16:
|
||||||
|
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if (!ggml_is_quantized(qtype)) {
|
||||||
|
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t total_size_org = 0;
|
||||||
|
size_t total_size_new = 0;
|
||||||
|
|
||||||
|
std::vector<float> work;
|
||||||
|
|
||||||
|
std::vector<uint8_t> data_u8;
|
||||||
|
std::vector<ggml_fp16_t> data_f16;
|
||||||
|
std::vector<float> data_f32;
|
||||||
|
|
||||||
|
std::vector<int64_t> hist_all(1 << 4, 0);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
int32_t n_dims;
|
||||||
|
int32_t length;
|
||||||
|
int32_t ttype;
|
||||||
|
|
||||||
|
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||||
|
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||||
|
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||||
|
|
||||||
|
if (finp.eof()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t nelements = 1;
|
||||||
|
int32_t ne[2] = { 1, 1 };
|
||||||
|
for (int i = 0; i < n_dims; ++i) {
|
||||||
|
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||||
|
nelements *= ne[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string name(length, 0);
|
||||||
|
finp.read (&name[0], length);
|
||||||
|
|
||||||
|
printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype));
|
||||||
|
|
||||||
|
bool quantize = false;
|
||||||
|
|
||||||
|
// check if we should quantize this tensor
|
||||||
|
for (const auto & s : to_quant) {
|
||||||
|
if (std::regex_match(name, std::regex(s))) {
|
||||||
|
quantize = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// check if we should skip this tensor
|
||||||
|
for (const auto & s : to_skip) {
|
||||||
|
if (std::regex_match(name, std::regex(s))) {
|
||||||
|
quantize = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// quantize only 2D tensors
|
||||||
|
quantize &= (n_dims == 2);
|
||||||
|
|
||||||
|
if (quantize) {
|
||||||
|
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
|
||||||
|
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ttype == GGML_TYPE_F16) {
|
||||||
|
data_f16.resize(nelements);
|
||||||
|
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
|
||||||
|
data_f32.resize(nelements);
|
||||||
|
for (int i = 0; i < nelements; ++i) {
|
||||||
|
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
data_f32.resize(nelements);
|
||||||
|
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
|
||||||
|
}
|
||||||
|
|
||||||
|
ttype = qtype;
|
||||||
|
} else {
|
||||||
|
const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
|
||||||
|
|
||||||
|
data_u8.resize(nelements*bpe);
|
||||||
|
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
|
||||||
|
}
|
||||||
|
|
||||||
|
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||||
|
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
|
||||||
|
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||||
|
for (int i = 0; i < n_dims; ++i) {
|
||||||
|
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||||
|
}
|
||||||
|
fout.write(&name[0], length);
|
||||||
|
|
||||||
|
if (quantize) {
|
||||||
|
work.resize(nelements); // for quantization
|
||||||
|
|
||||||
|
size_t cur_size = 0;
|
||||||
|
std::vector<int64_t> hist_cur(1 << 4, 0);
|
||||||
|
|
||||||
|
switch ((ggml_type) ttype) {
|
||||||
|
case GGML_TYPE_Q4_0:
|
||||||
|
{
|
||||||
|
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_Q4_1:
|
||||||
|
{
|
||||||
|
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_Q4_2:
|
||||||
|
{
|
||||||
|
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_Q5_0:
|
||||||
|
{
|
||||||
|
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_Q5_1:
|
||||||
|
{
|
||||||
|
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_Q8_0:
|
||||||
|
{
|
||||||
|
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
|
||||||
|
} break;
|
||||||
|
case GGML_TYPE_F32:
|
||||||
|
case GGML_TYPE_F16:
|
||||||
|
case GGML_TYPE_I8:
|
||||||
|
case GGML_TYPE_I16:
|
||||||
|
case GGML_TYPE_I32:
|
||||||
|
case GGML_TYPE_Q8_1:
|
||||||
|
case GGML_TYPE_COUNT:
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
|
||||||
|
total_size_new += cur_size;
|
||||||
|
|
||||||
|
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
|
||||||
|
for (int i = 0; i < hist_cur.size(); ++i) {
|
||||||
|
hist_all[i] += hist_cur[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < hist_cur.size(); ++i) {
|
||||||
|
printf("%5.3f ", hist_cur[i] / (float)nelements);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
} else {
|
||||||
|
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
|
||||||
|
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
|
||||||
|
total_size_new += data_u8.size();
|
||||||
|
}
|
||||||
|
|
||||||
|
total_size_org += nelements * sizeof(float);
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
||||||
|
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
|
||||||
|
|
||||||
|
{
|
||||||
|
int64_t sum_all = 0;
|
||||||
|
for (int i = 0; i < hist_all.size(); ++i) {
|
||||||
|
sum_all += hist_all[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: hist: ", __func__);
|
||||||
|
for (int i = 0; i < hist_all.size(); ++i) {
|
||||||
|
printf("%5.3f ", hist_all[i] / (float)sum_all);
|
||||||
|
}
|
||||||
|
printf("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
18
examples/common-ggml.h
Normal file
18
examples/common-ggml.h
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include <fstream>
|
||||||
|
#include <vector>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
enum ggml_ftype ggml_parse_ftype(const char * str);
|
||||||
|
|
||||||
|
void ggml_print_ftypes(FILE * fp = stderr);
|
||||||
|
|
||||||
|
bool ggml_common_quantize_0(
|
||||||
|
std::ifstream & finp,
|
||||||
|
std::ofstream & fout,
|
||||||
|
const ggml_ftype ftype,
|
||||||
|
const std::vector<std::string> & to_quant,
|
||||||
|
const std::vector<std::string> & to_skip);
|
@ -6,13 +6,86 @@
|
|||||||
#include "dr_wav.h"
|
#include "dr_wav.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <fstream>
|
||||||
#include <regex>
|
#include <regex>
|
||||||
|
|
||||||
#ifndef M_PI
|
#ifndef M_PI
|
||||||
#define M_PI 3.14159265358979323846
|
#define M_PI 3.14159265358979323846
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
|
||||||
|
for (int i = 1; i < argc; i++) {
|
||||||
|
std::string arg = argv[i];
|
||||||
|
|
||||||
|
if (arg == "-s" || arg == "--seed") {
|
||||||
|
params.seed = std::stoi(argv[++i]);
|
||||||
|
} else if (arg == "-t" || arg == "--threads") {
|
||||||
|
params.n_threads = std::stoi(argv[++i]);
|
||||||
|
} else if (arg == "-p" || arg == "--prompt") {
|
||||||
|
params.prompt = argv[++i];
|
||||||
|
} else if (arg == "-n" || arg == "--n_predict") {
|
||||||
|
params.n_predict = std::stoi(argv[++i]);
|
||||||
|
} else if (arg == "--top_k") {
|
||||||
|
params.top_k = std::stoi(argv[++i]);
|
||||||
|
} else if (arg == "--top_p") {
|
||||||
|
params.top_p = std::stof(argv[++i]);
|
||||||
|
} else if (arg == "--temp") {
|
||||||
|
params.temp = std::stof(argv[++i]);
|
||||||
|
} else if (arg == "-b" || arg == "--batch_size") {
|
||||||
|
params.n_batch = std::stoi(argv[++i]);
|
||||||
|
} else if (arg == "-m" || arg == "--model") {
|
||||||
|
params.model = argv[++i];
|
||||||
|
} else if (arg == "-h" || arg == "--help") {
|
||||||
|
gpt_print_usage(argc, argv, params);
|
||||||
|
exit(0);
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||||
|
gpt_print_usage(argc, argv, params);
|
||||||
|
exit(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
|
||||||
|
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
fprintf(stderr, "options:\n");
|
||||||
|
fprintf(stderr, " -h, --help show this help message and exit\n");
|
||||||
|
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
|
||||||
|
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||||
|
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
|
||||||
|
fprintf(stderr, " prompt to start generation with (default: random)\n");
|
||||||
|
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
|
||||||
|
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
|
||||||
|
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
|
||||||
|
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
|
||||||
|
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||||
|
fprintf(stderr, " -m FNAME, --model FNAME\n");
|
||||||
|
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
|
||||||
|
fprintf(stderr, "\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string gpt_random_prompt(std::mt19937 & rng) {
|
||||||
|
const int r = rng() % 10;
|
||||||
|
switch (r) {
|
||||||
|
case 0: return "So";
|
||||||
|
case 1: return "Once upon a time";
|
||||||
|
case 2: return "When";
|
||||||
|
case 3: return "The";
|
||||||
|
case 4: return "After";
|
||||||
|
case 5: return "If";
|
||||||
|
case 6: return "import";
|
||||||
|
case 7: return "He";
|
||||||
|
case 8: return "She";
|
||||||
|
case 9: return "They";
|
||||||
|
default: return "To";
|
||||||
|
}
|
||||||
|
|
||||||
|
return "The";
|
||||||
|
}
|
||||||
|
|
||||||
std::string trim(const std::string & s) {
|
std::string trim(const std::string & s) {
|
||||||
std::regex e("^\\s+|\\s+$");
|
std::regex e("^\\s+|\\s+$");
|
||||||
return std::regex_replace(s, e, "");
|
return std::regex_replace(s, e, "");
|
||||||
@ -28,6 +101,251 @@ std::string replace(const std::string & s, const std::string & from, const std::
|
|||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
std::map<std::string, int32_t> json_parse(const std::string & fname) {
|
||||||
|
std::map<std::string, int32_t> result;
|
||||||
|
|
||||||
|
// read file into string
|
||||||
|
std::string json;
|
||||||
|
{
|
||||||
|
std::ifstream ifs(fname);
|
||||||
|
if (!ifs) {
|
||||||
|
fprintf(stderr, "Failed to open %s\n", fname.c_str());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
json = std::string((std::istreambuf_iterator<char>(ifs)),
|
||||||
|
(std::istreambuf_iterator<char>()));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (json[0] != '{') {
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// parse json
|
||||||
|
{
|
||||||
|
bool has_key = false;
|
||||||
|
bool in_token = false;
|
||||||
|
|
||||||
|
std::string str_key = "";
|
||||||
|
std::string str_val = "";
|
||||||
|
|
||||||
|
int n = json.size();
|
||||||
|
for (int i = 1; i < n; ++i) {
|
||||||
|
if (!in_token) {
|
||||||
|
if (json[i] == ' ') continue;
|
||||||
|
if (json[i] == '"') {
|
||||||
|
in_token = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (json[i] == '\\' && i+1 < n) {
|
||||||
|
if (has_key == false) {
|
||||||
|
str_key += json[i];
|
||||||
|
} else {
|
||||||
|
str_val += json[i];
|
||||||
|
}
|
||||||
|
++i;
|
||||||
|
} else if (json[i] == '"') {
|
||||||
|
if (has_key == false) {
|
||||||
|
has_key = true;
|
||||||
|
++i;
|
||||||
|
while (json[i] == ' ') ++i;
|
||||||
|
++i; // :
|
||||||
|
while (json[i] == ' ') ++i;
|
||||||
|
if (json[i] != '\"') {
|
||||||
|
while (json[i] != ',' && json[i] != '}') {
|
||||||
|
str_val += json[i++];
|
||||||
|
}
|
||||||
|
has_key = false;
|
||||||
|
} else {
|
||||||
|
in_token = true;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
has_key = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
|
||||||
|
str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
|
||||||
|
str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "
|
||||||
|
|
||||||
|
try {
|
||||||
|
result[str_key] = std::stoi(str_val);
|
||||||
|
} catch (...) {
|
||||||
|
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
|
||||||
|
|
||||||
|
}
|
||||||
|
str_key = "";
|
||||||
|
str_val = "";
|
||||||
|
in_token = false;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (has_key == false) {
|
||||||
|
str_key += json[i];
|
||||||
|
} else {
|
||||||
|
str_val += json[i];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
||||||
|
std::vector<std::string> words;
|
||||||
|
|
||||||
|
// first split the text into words
|
||||||
|
{
|
||||||
|
std::string str = text;
|
||||||
|
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
||||||
|
|
||||||
|
std::regex re(pat);
|
||||||
|
std::smatch m;
|
||||||
|
|
||||||
|
while (std::regex_search(str, m, re)) {
|
||||||
|
for (auto x : m) {
|
||||||
|
words.push_back(x);
|
||||||
|
}
|
||||||
|
str = m.suffix();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find the longest tokens that form the words:
|
||||||
|
std::vector<gpt_vocab::id> tokens;
|
||||||
|
for (const auto & word : words) {
|
||||||
|
if (word.size() == 0) continue;
|
||||||
|
|
||||||
|
int i = 0;
|
||||||
|
int n = word.size();
|
||||||
|
while (i < n) {
|
||||||
|
int j = n;
|
||||||
|
while (j > i) {
|
||||||
|
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
||||||
|
if (it != vocab.token_to_id.end()) {
|
||||||
|
tokens.push_back(it->second);
|
||||||
|
i = j;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
--j;
|
||||||
|
}
|
||||||
|
if (i == n) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (j == i) {
|
||||||
|
auto sub = word.substr(i, 1);
|
||||||
|
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
||||||
|
tokens.push_back(vocab.token_to_id.at(sub));
|
||||||
|
} else {
|
||||||
|
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
||||||
|
}
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tokens;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
|
||||||
|
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
|
||||||
|
|
||||||
|
vocab.token_to_id = ::json_parse(fname);
|
||||||
|
|
||||||
|
for (const auto & kv : vocab.token_to_id) {
|
||||||
|
vocab.id_to_token[kv.second] = kv.first;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
|
||||||
|
|
||||||
|
// print the vocabulary
|
||||||
|
//for (auto kv : vocab.token_to_id) {
|
||||||
|
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
|
||||||
|
//}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
gpt_vocab::id gpt_sample_top_k_top_p(
|
||||||
|
const gpt_vocab & vocab,
|
||||||
|
const float * logits,
|
||||||
|
int top_k,
|
||||||
|
double top_p,
|
||||||
|
double temp,
|
||||||
|
std::mt19937 & rng) {
|
||||||
|
int n_logits = vocab.id_to_token.size();
|
||||||
|
|
||||||
|
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
|
||||||
|
logits_id.reserve(n_logits);
|
||||||
|
|
||||||
|
{
|
||||||
|
const double scale = 1.0/temp;
|
||||||
|
for (int i = 0; i < n_logits; ++i) {
|
||||||
|
logits_id.push_back(std::make_pair(logits[i]*scale, i));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// find the top K tokens
|
||||||
|
std::partial_sort(
|
||||||
|
logits_id.begin(),
|
||||||
|
logits_id.begin() + top_k, logits_id.end(),
|
||||||
|
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
|
||||||
|
return a.first > b.first;
|
||||||
|
});
|
||||||
|
|
||||||
|
logits_id.resize(top_k);
|
||||||
|
|
||||||
|
double maxl = -INFINITY;
|
||||||
|
for (const auto & kv : logits_id) {
|
||||||
|
maxl = std::max(maxl, kv.first);
|
||||||
|
}
|
||||||
|
|
||||||
|
// compute probs for the top K tokens
|
||||||
|
std::vector<double> probs;
|
||||||
|
probs.reserve(logits_id.size());
|
||||||
|
|
||||||
|
double sum = 0.0;
|
||||||
|
for (const auto & kv : logits_id) {
|
||||||
|
double p = exp(kv.first - maxl);
|
||||||
|
probs.push_back(p);
|
||||||
|
sum += p;
|
||||||
|
}
|
||||||
|
|
||||||
|
// normalize the probs
|
||||||
|
for (auto & p : probs) {
|
||||||
|
p /= sum;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (top_p < 1.0f) {
|
||||||
|
double cumsum = 0.0f;
|
||||||
|
for (int i = 0; i < top_k; i++) {
|
||||||
|
cumsum += probs[i];
|
||||||
|
if (cumsum >= top_p) {
|
||||||
|
top_k = i + 1;
|
||||||
|
probs.resize(top_k);
|
||||||
|
logits_id.resize(top_k);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
cumsum = 1.0/cumsum;
|
||||||
|
for (int i = 0; i < (int) probs.size(); i++) {
|
||||||
|
probs[i] *= cumsum;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
//printf("\n");
|
||||||
|
//for (int i = 0; i < (int) probs.size(); i++) {
|
||||||
|
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
|
||||||
|
//}
|
||||||
|
//exit(0);
|
||||||
|
|
||||||
|
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||||
|
int idx = dist(rng);
|
||||||
|
|
||||||
|
return logits_id[idx].second;
|
||||||
|
}
|
||||||
|
|
||||||
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
|
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
|
||||||
drwav wav;
|
drwav wav;
|
||||||
std::vector<uint8_t> wav_data; // used for pipe input from stdin
|
std::vector<uint8_t> wav_data; // used for pipe input from stdin
|
||||||
|
@ -1,10 +1,44 @@
|
|||||||
|
// Various helper functions and utilities
|
||||||
|
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
// needs to match WHISPER_SAMPLE_RATE
|
#include <string>
|
||||||
|
#include <map>
|
||||||
|
#include <vector>
|
||||||
|
#include <random>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
#define COMMON_SAMPLE_RATE 16000
|
#define COMMON_SAMPLE_RATE 16000
|
||||||
|
|
||||||
#include <vector>
|
//
|
||||||
#include <string>
|
// CLI argument parsing
|
||||||
|
//
|
||||||
|
|
||||||
|
struct gpt_params {
|
||||||
|
int32_t seed = -1; // RNG seed
|
||||||
|
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||||
|
int32_t n_predict = 200; // new tokens to predict
|
||||||
|
|
||||||
|
// sampling parameters
|
||||||
|
int32_t top_k = 40;
|
||||||
|
float top_p = 0.9f;
|
||||||
|
float temp = 0.9f;
|
||||||
|
|
||||||
|
int32_t n_batch = 8; // batch size for prompt processing
|
||||||
|
|
||||||
|
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
|
||||||
|
std::string prompt;
|
||||||
|
};
|
||||||
|
|
||||||
|
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||||
|
|
||||||
|
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||||
|
|
||||||
|
std::string gpt_random_prompt(std::mt19937 & rng);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Vocab utils
|
||||||
|
//
|
||||||
|
|
||||||
std::string trim(const std::string & s);
|
std::string trim(const std::string & s);
|
||||||
|
|
||||||
@ -13,6 +47,52 @@ std::string replace(
|
|||||||
const std::string & from,
|
const std::string & from,
|
||||||
const std::string & to);
|
const std::string & to);
|
||||||
|
|
||||||
|
struct gpt_vocab {
|
||||||
|
using id = int32_t;
|
||||||
|
using token = std::string;
|
||||||
|
|
||||||
|
std::map<token, id> token_to_id;
|
||||||
|
std::map<id, token> id_to_token;
|
||||||
|
};
|
||||||
|
|
||||||
|
// poor-man's JSON parsing
|
||||||
|
std::map<std::string, int32_t> json_parse(const std::string & fname);
|
||||||
|
|
||||||
|
// split text into tokens
|
||||||
|
//
|
||||||
|
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
||||||
|
//
|
||||||
|
// Regex (Python):
|
||||||
|
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
|
||||||
|
//
|
||||||
|
// Regex (C++):
|
||||||
|
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
|
||||||
|
//
|
||||||
|
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
|
||||||
|
|
||||||
|
// load the tokens from encoder.json
|
||||||
|
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
||||||
|
|
||||||
|
// sample next token given probabilities for each embedding
|
||||||
|
//
|
||||||
|
// - consider only the top K tokens
|
||||||
|
// - from them, consider only the top tokens with cumulative probability > P
|
||||||
|
//
|
||||||
|
// TODO: not sure if this implementation is correct
|
||||||
|
// TODO: temperature is not implemented
|
||||||
|
//
|
||||||
|
gpt_vocab::id gpt_sample_top_k_top_p(
|
||||||
|
const gpt_vocab & vocab,
|
||||||
|
const float * logits,
|
||||||
|
int top_k,
|
||||||
|
double top_p,
|
||||||
|
double temp,
|
||||||
|
std::mt19937 & rng);
|
||||||
|
|
||||||
|
//
|
||||||
|
// Audio utils
|
||||||
|
//
|
||||||
|
|
||||||
// Read WAV audio file and store the PCM data into pcmf32
|
// Read WAV audio file and store the PCM data into pcmf32
|
||||||
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
|
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
|
||||||
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
|
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
|
||||||
|
@ -145,7 +145,15 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
|
|||||||
var db = event.target.result;
|
var db = event.target.result;
|
||||||
var tx = db.transaction(['models'], 'readwrite');
|
var tx = db.transaction(['models'], 'readwrite');
|
||||||
var os = tx.objectStore('models');
|
var os = tx.objectStore('models');
|
||||||
|
|
||||||
|
var rq = null;
|
||||||
|
try {
|
||||||
var rq = os.put(data, url);
|
var rq = os.put(data, url);
|
||||||
|
} catch (e) {
|
||||||
|
cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
|
||||||
|
cbCancel();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
rq.onsuccess = function (event) {
|
rq.onsuccess = function (event) {
|
||||||
cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
|
cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
|
||||||
@ -180,7 +188,6 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
|
|||||||
|
|
||||||
rq.onabort = function (event) {
|
rq.onabort = function (event) {
|
||||||
cbPrint('loadRemote: failed to open IndexedDB: abort');
|
cbPrint('loadRemote: failed to open IndexedDB: abort');
|
||||||
|
cbCancel();
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -496,7 +496,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
|
|||||||
value_i("layer", whisper_model_n_text_layer(ctx), true);
|
value_i("layer", whisper_model_n_text_layer(ctx), true);
|
||||||
end_obj();
|
end_obj();
|
||||||
value_i("mels", whisper_model_n_mels(ctx));
|
value_i("mels", whisper_model_n_mels(ctx));
|
||||||
value_i("f16", whisper_model_f16(ctx), true);
|
value_i("ftype", whisper_model_ftype(ctx), true);
|
||||||
end_obj();
|
end_obj();
|
||||||
start_obj("params");
|
start_obj("params");
|
||||||
value_s("model", params.model.c_str());
|
value_s("model", params.model.c_str());
|
||||||
|
6
examples/quantize/CMakeLists.txt
Normal file
6
examples/quantize/CMakeLists.txt
Normal file
@ -0,0 +1,6 @@
|
|||||||
|
set(TARGET quantize)
|
||||||
|
add_executable(${TARGET} quantize.cpp)
|
||||||
|
|
||||||
|
include(DefaultTargetOptions)
|
||||||
|
|
||||||
|
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
|
3
examples/quantize/README.md
Normal file
3
examples/quantize/README.md
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
# quantize
|
||||||
|
|
||||||
|
Tool for integer quantization of Whisper `ggml` model files
|
215
examples/quantize/quantize.cpp
Normal file
215
examples/quantize/quantize.cpp
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
#include "ggml.h"
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
#include "common-ggml.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <regex>
|
||||||
|
|
||||||
|
// default hparams (Whisper tiny)
|
||||||
|
struct whisper_hparams {
|
||||||
|
int32_t n_vocab = 51864;
|
||||||
|
int32_t n_audio_ctx = 1500;
|
||||||
|
int32_t n_audio_state = 384;
|
||||||
|
int32_t n_audio_head = 6;
|
||||||
|
int32_t n_audio_layer = 4;
|
||||||
|
int32_t n_text_ctx = 448;
|
||||||
|
int32_t n_text_state = 384;
|
||||||
|
int32_t n_text_head = 6;
|
||||||
|
int32_t n_text_layer = 4;
|
||||||
|
int32_t n_mels = 80;
|
||||||
|
int32_t f16 = 1;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct whisper_filters {
|
||||||
|
int32_t n_mel;
|
||||||
|
int32_t n_fft;
|
||||||
|
|
||||||
|
std::vector<float> data;
|
||||||
|
};
|
||||||
|
|
||||||
|
// quantize a model
|
||||||
|
bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
|
||||||
|
gpt_vocab vocab;
|
||||||
|
|
||||||
|
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
|
||||||
|
|
||||||
|
auto finp = std::ifstream(fname_inp, std::ios::binary);
|
||||||
|
if (!finp) {
|
||||||
|
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto fout = std::ofstream(fname_out, std::ios::binary);
|
||||||
|
if (!fout) {
|
||||||
|
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// verify magic
|
||||||
|
{
|
||||||
|
uint32_t magic;
|
||||||
|
finp.read((char *) &magic, sizeof(magic));
|
||||||
|
if (magic != 0x67676d6c) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
fout.write((char *) &magic, sizeof(magic));
|
||||||
|
}
|
||||||
|
|
||||||
|
whisper_hparams hparams;
|
||||||
|
|
||||||
|
// load hparams
|
||||||
|
{
|
||||||
|
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||||
|
finp.read((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
|
||||||
|
finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
|
||||||
|
finp.read((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
|
||||||
|
finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
|
||||||
|
finp.read((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
|
||||||
|
finp.read((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
|
||||||
|
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
|
||||||
|
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
|
||||||
|
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
|
||||||
|
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
|
||||||
|
|
||||||
|
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||||
|
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
|
||||||
|
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
|
||||||
|
fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
|
||||||
|
fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
|
||||||
|
fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
|
||||||
|
fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
|
||||||
|
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
|
||||||
|
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
|
||||||
|
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
|
||||||
|
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
|
||||||
|
|
||||||
|
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||||
|
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
|
||||||
|
fout.write((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
|
||||||
|
fout.write((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
|
||||||
|
fout.write((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
|
||||||
|
fout.write((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
|
||||||
|
        fout.write((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
        fout.write((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
        fout.write((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
        fout.write((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
        fout.write((char *) &ftype,                 sizeof(hparams.f16));
    }

    // load mel filters
    {
        whisper_filters filters;

        finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
        fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
        finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
        fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));

        filters.data.resize(filters.n_mel * filters.n_fft);
        finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
        fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
    }

    // load vocab
    {
        int32_t n_vocab = 0;
        finp.read ((char *) &n_vocab, sizeof(n_vocab));
        fout.write((char *) &n_vocab, sizeof(n_vocab));

        //if (n_vocab != hparams.n_vocab) {
        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
        //            __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
        //    return false;
        //}

        std::string word;
        for (int i = 0; i < n_vocab; i++) {
            uint32_t len;
            finp.read ((char *) &len, sizeof(len));
            fout.write((char *) &len, sizeof(len));

            word.resize(len);
            finp.read ((char *) word.data(), len);
            fout.write((char *) word.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }

    // regexes of tensor names to not be quantized
    const std::vector<std::string> to_skip = {
        //"encoder.*",
        "encoder.conv1.bias",
        "encoder.conv2.bias",
        "encoder.positional_embedding",
        "decoder.positional_embedding",
    };

    if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
        return false;
    }

    finp.close();
    fout.close();

    return true;
}

int main(int argc, char ** argv) {
    if (argc != 4) {
        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
        ggml_print_ftypes(stderr);
        return 1;
    }

    // needed to initialize f16 tables
    {
        struct ggml_init_params params = { 0, NULL, false };
        struct ggml_context * ctx = ggml_init(params);
        ggml_free(ctx);
    }

    const std::string fname_inp = argv[1];
    const std::string fname_out = argv[2];

    const ggml_ftype ftype = ggml_parse_ftype(argv[3]);

    const int64_t t_main_start_us = ggml_time_us();

    int64_t t_quantize_us = 0;

    // load the model
    {
        const int64_t t_start_us = ggml_time_us();

        if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
            fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
            return 1;
        }

        t_quantize_us = ggml_time_us() - t_start_us;
    }

    // report timing
    {
        const int64_t t_main_end_us = ggml_time_us();

        printf("\n");
        printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
        printf("%s:    total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
    }

    return 0;
}
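The { ".*" } / to_skip pair passed to ggml_common_quantize_0 acts as an include/exclude filter over tensor names: a tensor matching an include regex is quantized unless its name also matches a skip regex. A minimal, self-contained C++ sketch of that idea follows; the helper name and the exact matching rules are illustrative assumptions, not the actual common-ggml implementation added by this commit.

// Sketch only: how an include/skip regex pair over tensor names can be evaluated.
// tensor_should_quantize() is a hypothetical helper for illustration.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

static bool tensor_should_quantize(
        const std::string & name,
        const std::vector<std::string> & to_quant,
        const std::vector<std::string> & to_skip) {
    for (const auto & s : to_skip) {
        if (std::regex_match(name, std::regex(s))) return false; // skip list wins
    }
    for (const auto & s : to_quant) {
        if (std::regex_match(name, std::regex(s))) return true;
    }
    return false;
}

int main() {
    const std::vector<std::string> to_quant = { ".*" };
    const std::vector<std::string> to_skip  = {
        "encoder.conv1.bias",
        "encoder.positional_embedding",
    };

    printf("%d\n", tensor_should_quantize("encoder.conv1.weight",         to_quant, to_skip)); // 1
    printf("%d\n", tensor_should_quantize("encoder.positional_embedding", to_quant, to_skip)); // 0
    return 0;
}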
|
@@ -35,6 +35,15 @@

    <br><br>

+   <b>More examples:</b>
+   <a href="https://whisper.ggerganov.com/">main</a> |
+   <a href="https://whisper.ggerganov.com/bench">bench</a> |
+   <a href="https://whisper.ggerganov.com/stream">stream</a> |
+   <a href="https://whisper.ggerganov.com/command">command</a> |
+   <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+   <br><br>
+
    <hr>

    Select the model you would like to use, click the "Start" button and start speaking

@@ -45,6 +54,10 @@

    Whisper model: <span id="model-whisper-status"></span>
    <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
    <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+   <br><br>
+   Quantized models:<br><br>
+   <button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+   <button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
    <span id="fetch-whisper-progress"></span>

    <!--

@@ -162,11 +175,17 @@

    let urls = {
        'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
        'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+       'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+       'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
    };

    let sizes = {
        'tiny.en': 75,
        'base.en': 142,
+
+       'tiny-en-q5_1': 31,
+       'base-en-q5_1': 57,
    };

    let url = urls[model];

@@ -177,6 +196,10 @@

    document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
    document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+   document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+   document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
    document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

    cbProgress = function(p) {

@@ -188,6 +211,10 @@

    var el;
    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+   el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+   el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
    el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
};
examples/talk-llama/llama_util.h → examples/talk-llama/llama-util.h (68 changes; executable file → normal file)
@@ -21,12 +21,17 @@
 #if defined(_POSIX_MAPPED_FILES)
 #include <sys/mman.h>
 #endif
+#if defined(_POSIX_MEMLOCK_RANGE)
+#include <sys/resource.h>
+#endif
 #endif
 #endif

 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
 #define NOMINMAX
+#endif
 #include <windows.h>
 #include <io.h>
 #include <stdio.h> // for _fseeki64

@@ -41,8 +46,12 @@
 } while (0)

 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
     va_list ap, ap2;
     va_start(ap, fmt);

@@ -55,7 +64,7 @@ static std::string format(const char * fmt, ...) {
     va_end(ap2);
     va_end(ap);
     return std::string(buf.data(), size);
-};
+}

 struct llama_file {
     // use FILE * so we don't have to re-open the file to mmap

@@ -162,7 +171,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;

@@ -170,17 +179,18 @@ struct llama_mmap {
         flags |= MAP_POPULATE;
 #endif
         addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
         if (addr == MAP_FAILED) {
             throw format("mmap failed: %s", strerror(errno));
         }

+        if (prefetch) {
             // Advise the kernel to preload the mapped memory
             if (madvise(addr, file->size, MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
+    }

     ~llama_mmap() {
         munmap(addr, size);

@@ -188,14 +198,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
         size = file->size;

         HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

         HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
         DWORD error = GetLastError();
-        CloseHandle(hFile);

         if (hMapping == NULL) {
             throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());

@@ -209,6 +218,8 @@ struct llama_mmap {
             throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
         }

+        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
             // Advise the kernel to preload the mapped memory
             WIN32_MEMORY_RANGE_ENTRY range;
             range.VirtualAddress = addr;

@@ -218,6 +229,10 @@ struct llama_mmap {
                 llama_format_win_err(GetLastError()).c_str());
             }
         }
+        #else
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
+    }

     ~llama_mmap() {
         if (!UnmapViewOfFile(addr)) {

@@ -291,8 +306,18 @@ struct llama_mlock {
         if (!mlock(addr, size)) {
             return true;
         } else {
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
-                    size, this->size, std::strerror(errno));
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
             return false;
         }
     }

@@ -338,8 +363,8 @@ struct llama_mlock {
         // Hopefully a megabyte is enough overhead:
         size_t increment = size + 1048576;
         // The minimum must be <= the maximum, so we need to increase both:
-        min_ws_size += size;
-        max_ws_size += size;
+        min_ws_size += increment;
+        max_ws_size += increment;
         if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
             fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                     llama_format_win_err(GetLastError()).c_str());

@@ -380,4 +405,29 @@ struct llama_buffer {
         delete[] addr;
     }
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        this->size = size;
+    }
+
+    ~llama_ctx_buffer() {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+    }
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
 #endif
(file diff suppressed because it is too large)
@@ -39,12 +39,16 @@ extern "C" {

     typedef struct llama_token_data {
         llama_token id;  // token id
+        float logit;     // log-odds of the token
         float p;         // probability of the token
-        float plog;      // log probability of the token
     } llama_token_data;

+    typedef struct llama_token_data_array {
+        llama_token_data * data;
+        size_t size;
+        bool sorted;
+    } llama_token_data_array;
+
     typedef void (*llama_progress_callback)(float progress, void *ctx);

     struct llama_context_params {

@@ -65,6 +69,20 @@ extern "C" {
         void * progress_callback_user_data;
     };

+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32              = 0,
+        LLAMA_FTYPE_MOSTLY_F16           = 1, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0          = 2, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1          = 3, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2          = 5, // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0          = 7, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0          = 8, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1          = 9, // except 1d tensors
+    };
+
     LLAMA_API struct llama_context_params llama_context_default_params();

     LLAMA_API bool llama_mmap_supported();

@@ -82,27 +100,46 @@ extern "C" {

     // TODO: not great API - very likely to change
     // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
     LLAMA_API int llama_model_quantize(
             const char * fname_inp,
             const char * fname_out,
-            int itype);
+            enum llama_ftype ftype,
+            int nthread);

-    // Returns the KV cache that will contain the context for the
-    // ongoing prediction with the model.
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-    // Returns the size of the KV cache
-    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+            const char * path_lora,
+            const char * path_base_model,
+            int n_threads);

     // Returns the number of tokens in the KV cache
     LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

-    // Sets the KV cache containing the current context for the model
-    LLAMA_API void llama_set_kv_cache(
-            struct llama_context * ctx,
-            const uint8_t * kv_cache,
-            size_t n_size,
-            int n_token_count);
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+
+    // Save/load session file
+    LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);

     // Run the llama inference to obtain the logits and probabilities for the next token.
     // tokens + n_tokens is the provided batch of new tokens to process

@@ -148,16 +185,52 @@ extern "C" {
     // Special tokens
     LLAMA_API llama_token llama_token_bos();
     LLAMA_API llama_token llama_token_eos();
+    LLAMA_API llama_token llama_token_nl();

-    // TODO: improve the last_n_tokens interface ?
-    LLAMA_API llama_token llama_sample_top_p_top_k(
-            struct llama_context * ctx,
-            const llama_token * last_n_tokens_data,
-            int last_n_tokens_size,
-            int top_k,
-            float top_p,
-            float temp,
-            float repeat_penalty);
+    // Sampling functions
+
+    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+    /// @details Selects the token with the highest probability.
+    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Randomly selects a token from the candidates based on their probabilities.
+    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

     // Performance information
     LLAMA_API void llama_print_timings(struct llama_context * ctx);

@@ -170,4 +243,15 @@ extern "C" {
 }
 #endif

+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif
+
 #endif // LLAMA_H
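The state functions above replace the old llama_get_kv_cache / llama_set_kv_cache interface. A hedged sketch of how a caller might snapshot and later restore a context with the new API; the helper names and buffer handling are assumptions for illustration, only the declarations in the header above are taken as given.

// Sketch only: snapshot and restore a llama_context with the new state API.
#include <cstdint>
#include <vector>
#include "llama.h"

static std::vector<uint8_t> snapshot_state(llama_context * ctx) {
    // the reported size covers rng, logits, embedding and kv_cache
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    const size_t n = llama_copy_state_data(ctx, buf.data());
    buf.resize(n); // keep only the bytes actually copied
    return buf;
}

static void restore_state(llama_context * ctx, const std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}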
@@ -1,12 +0,0 @@
-// Internal header to be included by llama.cpp and tests/benchmarks only.
-
-#ifndef LLAMA_INTERNAL_H
-#define LLAMA_INTERNAL_H
-
-#include <vector>
-#include <string>
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif // LLAMA_INTERNAL_H
@@ -487,11 +487,37 @@ int main(int argc, char ** argv) {

         {
             auto logits  = llama_get_logits(ctx_llama);
+            auto n_vocab = llama_n_vocab(ctx_llama);
+
             logits[llama_token_eos()] = 0;

-            id = llama_sample_top_p_top_k(ctx_llama,
+            std::vector<llama_token_data> candidates;
+            candidates.reserve(n_vocab);
+            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+            }
+
+            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+            // apply repeat penalty
+            const float nl_logit = logits[llama_token_nl()];
+
+            llama_sample_repetition_penalty(ctx_llama, &candidates_p,
                     embd_inp.data() + std::max(0, n_past - repeat_last_n),
-                    repeat_last_n, top_k, top_p, temp, repeat_penalty);
+                    repeat_last_n, repeat_penalty);
+
+            logits[llama_token_nl()] = nl_logit;
+
+            if (temp <= 0) {
+                // Greedy sampling
+                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
+            } else {
+                // Temperature sampling
+                llama_sample_top_k(ctx_llama, &candidates_p, top_k);
+                llama_sample_top_p(ctx_llama, &candidates_p, top_p);
+                llama_sample_temperature(ctx_llama, &candidates_p, temp);
+                id = llama_sample_token(ctx_llama, &candidates_p);
+            }
         }

         if (id != llama_token_eos()) {
@@ -13,6 +13,7 @@ include(DefaultTargetOptions)

 target_link_libraries(${TARGET} PRIVATE
     whisper
+    common
     )

 unset(EXTRA_FLAGS)
@@ -1,4 +1,6 @@
 #include "ggml.h"
+#include "common-ggml.h"
+
 #include "gpt-2.h"

 #include <cmath>

@@ -14,150 +16,6 @@

 /////////////////////// GPT-2 BEGIN /////////////////////////

 [removed here: the file-local "Vocab utils", gpt_tokenize() and gpt_sample_top_k_top_p(), which now come from the shared examples/common helpers]

 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
     int32_t n_vocab = 50257;
@@ -165,7 +23,7 @@ struct gpt2_hparams {
     int32_t n_embd  = 768;
     int32_t n_head  = 12;
     int32_t n_layer = 12;
-    int32_t f16     = 1;
+    int32_t ftype   = 1;
 };

 struct gpt2_layer {

@@ -187,7 +45,7 @@ struct gpt2_layer {
     struct ggml_tensor * c_mlp_fc_w;
     struct ggml_tensor * c_mlp_fc_b;

-    struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
+    struct ggml_tensor * c_mlp_proj_w;
     struct ggml_tensor * c_mlp_proj_b;
 };

@@ -200,6 +58,7 @@ struct gpt2_model {

     struct ggml_tensor * wte;     // position embedding
     struct ggml_tensor * wpe;     //    token embedding
+    struct ggml_tensor * lm_head; // language model head

     std::vector<gpt2_layer> layers;

@@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
         fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));

         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head  = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
     }

     // load vocab
@@ -275,9 +134,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         }
     }

-    // for the big tensors, we have the option to store the data in 16-bit floats
+    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
     // in order to save memory and also to speed up the computation
-    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return false;
+    }

     auto & ctx = model.ctx;

@@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         const int n_ctx   = hparams.n_ctx;
         const int n_vocab = hparams.n_vocab;

 [in the ctx_size accounting below, every ggml_type_size(...) becomes ggml_type_sizef(...); the only new entry is the lm_head buffer]

         ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
         ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

         ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);       // wte
         ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);       // lm_head

         ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
         ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

         ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
         ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

         ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
         ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

         ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_proj_w
         ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b

         ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
         ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

         ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
         ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_mlp_proj_b

         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
         ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

         ctx_size += (6 + 12*n_layer)*256; // object overhead
@@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

     // create the ggml context
     {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = NULL;
+        struct ggml_init_params params = {
+            .mem_size   = ctx_size,
+            .mem_buffer = NULL,
+            .no_alloc   = false,
+        };

         model.ctx = ggml_init(params);
         if (!model.ctx) {

@@ -352,6 +219,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

         model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
         model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);

         // map by name
         model.tensors["model/ln_f/g"] = model.ln_f_g;

@@ -359,6 +227,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

         model.tensors["model/wte"]     = model.wte;
         model.tensors["model/wpe"]     = model.wpe;
+        model.tensors["model/lm_head"] = model.lm_head;

         for (int i = 0; i < n_layer; ++i) {
             auto & layer = model.layers[i];

@@ -369,16 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
             layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
+            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
             layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

             layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
             layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
             layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

-            layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
             layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

             // map by name

@@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]   = layer.c_mlp_fc_w;
             model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]   = layer.c_mlp_fc_b;

-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
+            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
             model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
         }
     }
@@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
     {
         size_t total_size = 0;

+        bool has_lm_head = false;
+
         while (true) {
             int32_t n_dims;
             int32_t length;
-            int32_t ftype;
+            int32_t ttype;

             fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
             fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

             if (fin.eof()) {
                 break;

@@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

             if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                 fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
                 return false;
             }

-            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+            // for debugging
+            if (0) {
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            }

-            if (nelements*bpe != ggml_nbytes(tensor)) {
+            const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                 fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                         __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                 return false;

@@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

             fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

-            //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            // GPT-2 models share the WTE tensor as the LM head
+            if (name == "model/wte" && has_lm_head == false) {
+                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
+            }
+
+            if (name == "model/lm_head") {
+                has_lm_head = true;
+            }

             total_size += ggml_nbytes(tensor);
         }
@@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - n_threads: number of threads to use
// - n_past:    the context size so far
// - embd_inp:  the embeddings of the tokens in the context
-// - embd_w:    the predicted probabilities of the next token
+// - embd_w:    the predicted logits for the next token
//
bool gpt2_eval(
        const gpt2_model & model,

@@ -512,12 +396,12 @@ bool gpt2_eval(
    const int n_head  = hparams.n_head;
    const int n_vocab = hparams.n_vocab;

-   static size_t buf_size = 640u*1024*1024;
+   static size_t buf_size = 512u*1024*1024;
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-       printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+       //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
        buf_size = buf_size_new;

@@ -528,13 +412,14 @@ bool gpt2_eval(
        }
    }

-   struct ggml_init_params params;
-   params.mem_size   = buf_size;
-   params.mem_buffer = buf;
+   struct ggml_init_params params = {
+       /*.mem_size   =*/ buf_size,
+       /*.mem_buffer =*/ buf,
+       /*.no_alloc   =*/ false,
+   };

    struct ggml_context * ctx0 = ggml_init(params);
-   struct ggml_cgraph gf = { };
+   struct ggml_cgraph gf = {};
    gf.n_threads = n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);

@@ -578,7 +463,7 @@ bool gpt2_eval(
        // [2304, N]
        {
            cur = ggml_mul_mat(ctx0,
-                   ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
+                   model.layers[il].c_attn_attn_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -654,11 +539,13 @@ bool gpt2_eval(
            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            // [n_past + N, 64, 12]
            struct ggml_tensor * V_trans =
+               ggml_cpy(ctx0,
                    ggml_permute(ctx0,
                            ggml_reshape_3d(ctx0,
                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                n_embd/n_head, n_head, n_past + N),
-                           1, 2, 0, 3);
+                           1, 2, 0, 3),
+                   ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

            // KQV = transpose(V) * KQ_soft_max
            // [64, N, 12]

@@ -685,7 +572,7 @@ bool gpt2_eval(
        // [768, N]
        {
            cur = ggml_mul_mat(ctx0,
-                   ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                   model.layers[il].c_attn_proj_w,
                    cur);

            cur = ggml_add(ctx0,

@@ -722,7 +609,7 @@ bool gpt2_eval(
        // cur = fc_w*cur + fc_b
        // [3072, N]
        cur = ggml_mul_mat(ctx0,
-               ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+               model.layers[il].c_mlp_fc_w,
                cur);

        cur = ggml_add(ctx0,

@@ -742,7 +629,7 @@ bool gpt2_eval(
        // cur = proj_w*cur + proj_b
        // [768, N]
        cur = ggml_mul_mat(ctx0,
-               model.layers[il].c_mlp_proj_w_trans,
+               model.layers[il].c_mlp_proj_w,
                cur);

        cur = ggml_add(ctx0,

@@ -769,12 +656,12 @@ bool gpt2_eval(
    }

    // inpL = WTE * inpL
-   // [ 768, 50257] - model.wte
+   // [ 768, 50257] - model.lm_head
    // [ 768, N]     - inpL
-   inpL = ggml_mul_mat(ctx0, model.wte, inpL);
+   inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

    // logits -> probs
-   inpL = ggml_soft_max(ctx0, inpL);
+   //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
|
|||||||
//embd_w.resize(n_vocab*N);
|
//embd_w.resize(n_vocab*N);
|
||||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||||
|
|
||||||
// return result for just the last token
|
// return result just for the last token
|
||||||
embd_w.resize(n_vocab);
|
embd_w.resize(n_vocab);
|
||||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||||
|
|
||||||
@ -825,7 +712,7 @@ Me too.
|
|||||||
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
||||||
|
|
||||||
// sampling parameters
|
// sampling parameters
|
||||||
int32_t top_k = 40;
|
int32_t top_k = 5;
|
||||||
float top_p = 0.9f;
|
float top_p = 0.9f;
|
||||||
float temp = 1.0f;
|
float temp = 1.0f;
|
||||||
};
|
};
|
||||||
@ -833,14 +720,14 @@ Me too.
|
|||||||
struct gpt2_context * gpt2_init(const char * path_model) {
|
struct gpt2_context * gpt2_init(const char * path_model) {
|
||||||
gpt2_context * ctx = new gpt2_context;
|
gpt2_context * ctx = new gpt2_context;
|
||||||
|
|
||||||
ctx->rng = std::mt19937(time(NULL));
|
ctx->rng = std::mt19937(time(nullptr));
|
||||||
|
|
||||||
// load the model
|
// load the model
|
||||||
{
|
{
|
||||||
const int64_t t_start_us = ggml_time_us();
|
const int64_t t_start_us = ggml_time_us();
|
||||||
|
|
||||||
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
|
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
|
||||||
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
|
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
|
||||||
delete ctx;
|
delete ctx;
|
||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
@ -885,9 +772,9 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
|
|||||||
|
|
||||||
std::string result;
|
std::string result;
|
||||||
|
|
||||||
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
|
for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
|
||||||
// predict
|
// predict
|
||||||
if (embd.size() > 0) {
|
if (!embd.empty()) {
|
||||||
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
|
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
|
||||||
printf("gpt-2: failed to generate text\n");
|
printf("gpt-2: failed to generate text\n");
|
||||||
return "";
|
return "";
|
||||||
@ -914,10 +801,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
|
|||||||
result += ctx->vocab.id_to_token[embd[0]];
|
result += ctx->vocab.id_to_token[embd[0]];
|
||||||
|
|
||||||
// end of text token
|
// end of text token
|
||||||
if (embd.back() == 50256 ||
|
if (embd.back() == 50256) {
|
||||||
ctx->vocab.id_to_token[embd.back()] == "." ||
|
|
||||||
ctx->vocab.id_to_token[embd.back()] == "!" ||
|
|
||||||
ctx->vocab.id_to_token[embd.back()] == "?") {
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@@ -2,18 +2,12 @@

 // TODO: Change to C-style API and move to ./examples for easy reuse.

+#include "common.h"
+
 #include <vector>
 #include <map>
 #include <string>

-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-};
-
 struct gpt2_context;

 struct gpt2_context * gpt2_init(const char * path_model);
@ -44,6 +44,15 @@
|
|||||||
|
|
||||||
<br><br>
|
<br><br>
|
||||||
|
|
||||||
|
<b>More examples:</b>
|
||||||
|
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||||
|
|
||||||
|
<br><br>
|
||||||
|
|
||||||
<hr>
|
<hr>
|
||||||
|
|
||||||
Select the models you would like to use and click the "Start" button to begin the conversation
|
Select the models you would like to use and click the "Start" button to begin the conversation
|
||||||
@ -54,6 +63,10 @@
|
|||||||
Whisper model: <span id="model-whisper-status"></span>
|
Whisper model: <span id="model-whisper-status"></span>
|
||||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||||
|
<br><br>
|
||||||
|
Quantized models:<br><br>
|
||||||
|
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
|
||||||
|
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
|
||||||
<span id="fetch-whisper-progress"></span>
|
<span id="fetch-whisper-progress"></span>
|
||||||
|
|
||||||
<!--
|
<!--
|
||||||
@ -266,11 +279,17 @@
|
|||||||
let urls = {
|
let urls = {
|
||||||
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
|
||||||
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
|
||||||
|
|
||||||
|
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
|
||||||
|
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
|
||||||
};
|
};
|
||||||
|
|
||||||
let sizes = {
|
let sizes = {
|
||||||
'tiny.en': 75,
|
'tiny.en': 75,
|
||||||
'base.en': 142,
|
'base.en': 142,
|
||||||
|
|
||||||
|
'tiny-en-q5_1': 31,
|
||||||
|
'base-en-q5_1': 57,
|
||||||
};
|
};
|
||||||
|
|
||||||
let url = urls[model];
|
let url = urls[model];
|
||||||
@ -281,6 +300,10 @@
|
|||||||
|
|
||||||
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
|
||||||
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
document.getElementById('fetch-whisper-base-en').style.display = 'none';
|
||||||
|
|
||||||
|
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
|
||||||
|
|
||||||
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
|
||||||
|
|
||||||
cbProgress = function(p) {
|
cbProgress = function(p) {
|
||||||
@ -292,6 +315,10 @@
|
|||||||
var el;
|
var el;
|
||||||
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
|
||||||
|
|
||||||
|
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
|
||||||
|
|
||||||
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,16 +1,8 @@
|
|||||||
if (WHISPER_SDL2)
|
if (WHISPER_SDL2)
|
||||||
# talk
|
# talk
|
||||||
set(TARGET talk)
|
set(TARGET talk)
|
||||||
#add_executable(${TARGET} talk.cpp gpt-2.cpp)
|
add_executable(${TARGET} talk.cpp gpt-2.cpp)
|
||||||
#target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
|
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
|
||||||
#target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
|
|
||||||
# TODO: this is temporary
|
|
||||||
# need to export ggml symbols for MSVC, but too lazy ..
|
|
||||||
add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
|
|
||||||
|
|
||||||
include(DefaultTargetOptions)
|
include(DefaultTargetOptions)
|
||||||
|
|
||||||
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
|
|
||||||
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
endif ()
|
endif ()
|
||||||
|
@ -1,4 +1,6 @@
|
|||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
#include "common-ggml.h"
|
||||||
|
|
||||||
#include "gpt-2.h"
|
#include "gpt-2.h"
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
@ -14,150 +16,6 @@
|
|||||||
|
|
||||||
/////////////////////// GPT-2 BEGIN /////////////////////////
|
/////////////////////// GPT-2 BEGIN /////////////////////////
|
||||||
|
|
||||||
//
|
|
||||||
// Vocab utils
|
|
||||||
//
|
|
||||||
|
|
||||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
|
|
||||||
std::vector<std::string> words;
|
|
||||||
|
|
||||||
// first split the text into words
|
|
||||||
{
|
|
||||||
std::string str = text;
|
|
||||||
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
|
||||||
|
|
||||||
std::regex re(pat);
|
|
||||||
std::smatch m;
|
|
||||||
|
|
||||||
while (std::regex_search(str, m, re)) {
|
|
||||||
for (auto x : m) {
|
|
||||||
words.push_back(x);
|
|
||||||
}
|
|
||||||
str = m.suffix();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// find the longest tokens that form the words:
|
|
||||||
std::vector<gpt_vocab::id> tokens;
|
|
||||||
for (const auto & word : words) {
|
|
||||||
if (word.empty()) continue;
|
|
||||||
|
|
||||||
int i = 0;
|
|
||||||
int n = word.size();
|
|
||||||
while (i < n) {
|
|
||||||
int j = n;
|
|
||||||
while (j > i) {
|
|
||||||
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
|
||||||
if (it != vocab.token_to_id.end()) {
|
|
||||||
tokens.push_back(it->second);
|
|
||||||
i = j;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
--j;
|
|
||||||
}
|
|
||||||
if (i == n) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
if (j == i) {
|
|
||||||
auto sub = word.substr(i, 1);
|
|
||||||
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
|
||||||
tokens.push_back(vocab.token_to_id.at(sub));
|
|
||||||
} else {
|
|
||||||
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
|
||||||
}
|
|
||||||
++i;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return tokens;
|
|
||||||
}
|
|
||||||
|
|
||||||
gpt_vocab::id gpt_sample_top_k_top_p(
|
|
||||||
const gpt_vocab & vocab,
|
|
||||||
const float * logits,
|
|
||||||
int top_k,
|
|
||||||
double top_p,
|
|
||||||
double /*temp*/,
|
|
||||||
std::mt19937 & rng) {
|
|
||||||
int n_logits = vocab.id_to_token.size();
|
|
||||||
|
|
||||||
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
|
|
||||||
logits_id.reserve(n_logits);
|
|
||||||
|
|
||||||
for (int i = 0; i < n_logits; i++) {
|
|
||||||
logits_id.emplace_back(logits[i], i);
|
|
||||||
}
|
|
||||||
|
|
||||||
// find the top K tokens
|
|
||||||
std::partial_sort(
|
|
||||||
logits_id.begin(),
|
|
||||||
logits_id.begin() + top_k, logits_id.end(),
|
|
||||||
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
|
|
||||||
return a.first > b.first;
|
|
||||||
});
|
|
||||||
|
|
||||||
logits_id.resize(top_k);
|
|
||||||
|
|
||||||
// normalize
|
|
||||||
{
|
|
||||||
double sum = 0.0f;
|
|
||||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
|
||||||
sum += logits_id[i].first;
|
|
||||||
}
|
|
||||||
|
|
||||||
sum = 1.0/sum;
|
|
||||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
|
||||||
logits_id[i].first *= sum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (top_p < 1.0f) {
|
|
||||||
{
|
|
||||||
double cumsum = 0.0f;
|
|
||||||
for (int i = 0; i < top_k; i++) {
|
|
||||||
cumsum += logits_id[i].first;
|
|
||||||
if (cumsum >= top_p) {
|
|
||||||
logits_id.resize(i+1);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// normalize again
|
|
||||||
{
|
|
||||||
double sum = 0.0f;
|
|
||||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
|
||||||
sum += logits_id[i].first;
|
|
||||||
}
|
|
||||||
|
|
||||||
sum = 1.0/sum;
|
|
||||||
for (int i = 0; i < (int)logits_id.size(); i++) {
|
|
||||||
logits_id[i].first *= sum;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
//printf("\n");
|
|
||||||
//for (int i = 0; i < (int) logits_id.size(); i++) {
|
|
||||||
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
|
|
||||||
//}
|
|
||||||
//exit(0);
|
|
||||||
|
|
||||||
// sample from the obtained distribution
|
|
||||||
std::vector<double> probs;
|
|
||||||
probs.reserve(logits_id.size());
|
|
||||||
|
|
||||||
for (int i = 0; i < (int) logits_id.size(); i++) {
|
|
||||||
probs.push_back(logits_id[i].first);
|
|
||||||
}
|
|
||||||
|
|
||||||
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
|
||||||
int idx = dist(rng);
|
|
||||||
|
|
||||||
return logits_id[idx].second;
|
|
||||||
}
|
|
||||||
|
|
||||||
// default hparams (GPT-2 117M)
|
// default hparams (GPT-2 117M)
|
||||||
struct gpt2_hparams {
|
struct gpt2_hparams {
|
||||||
int32_t n_vocab = 50257;
|
int32_t n_vocab = 50257;
|
||||||
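The tokenizer and sampling helpers are deleted from this file because the example now pulls them in from the shared example code (note the new #include "common.h" at the top of this diff), which is where gpt_vocab, gpt_tokenize and gpt_sample_top_k_top_p live from now on. A minimal sketch of the resulting call pattern, assuming the shared declarations keep the same signatures as the functions removed above:

    #include "common.h"   // gpt_vocab, gpt_tokenize, gpt_sample_top_k_top_p

    #include <random>
    #include <string>
    #include <vector>

    // split a prompt into vocabulary ids using the shared tokenizer
    static std::vector<gpt_vocab::id> prompt_tokens(const gpt_vocab & vocab, const std::string & prompt) {
        return gpt_tokenize(vocab, prompt);
    }

    // pick the next token from the model's logits; the parameter values match gpt2_params
    static gpt_vocab::id next_token(const gpt_vocab & vocab, const float * logits, std::mt19937 & rng) {
        return gpt_sample_top_k_top_p(vocab, logits, /*top_k =*/ 5, /*top_p =*/ 0.9, /*temp =*/ 1.0, rng);
    }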
@ -165,7 +23,7 @@ struct gpt2_hparams {
|
|||||||
int32_t n_embd = 768;
|
int32_t n_embd = 768;
|
||||||
int32_t n_head = 12;
|
int32_t n_head = 12;
|
||||||
int32_t n_layer = 12;
|
int32_t n_layer = 12;
|
||||||
int32_t f16 = 1;
|
int32_t ftype = 1;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct gpt2_layer {
|
struct gpt2_layer {
|
||||||
@ -187,7 +45,7 @@ struct gpt2_layer {
|
|||||||
struct ggml_tensor * c_mlp_fc_w;
|
struct ggml_tensor * c_mlp_fc_w;
|
||||||
struct ggml_tensor * c_mlp_fc_b;
|
struct ggml_tensor * c_mlp_fc_b;
|
||||||
|
|
||||||
struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
|
struct ggml_tensor * c_mlp_proj_w;
|
||||||
struct ggml_tensor * c_mlp_proj_b;
|
struct ggml_tensor * c_mlp_proj_b;
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -200,6 +58,7 @@ struct gpt2_model {
|
|||||||
|
|
||||||
struct ggml_tensor * wte; // position embedding
|
struct ggml_tensor * wte; // position embedding
|
||||||
struct ggml_tensor * wpe; // token embedding
|
struct ggml_tensor * wpe; // token embedding
|
||||||
|
struct ggml_tensor * lm_head; // language model head
|
||||||
|
|
||||||
std::vector<gpt2_layer> layers;
|
std::vector<gpt2_layer> layers;
|
||||||
|
|
||||||
@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||||
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||||
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||||
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
|
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||||
|
|
||||||
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||||
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||||
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||||
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||||
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||||
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||||
}
|
}
|
||||||
|
|
||||||
// load vocab
|
// load vocab
|
||||||
@ -268,16 +127,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
fin.read((char *) &len, sizeof(len));
|
fin.read((char *) &len, sizeof(len));
|
||||||
|
|
||||||
word.resize(len);
|
word.resize(len);
|
||||||
fin.read((char *) &word[0], len);
|
fin.read((char *) word.data(), len);
|
||||||
|
|
||||||
vocab.token_to_id[word] = i;
|
vocab.token_to_id[word] = i;
|
||||||
vocab.id_to_token[i] = word;
|
vocab.id_to_token[i] = word;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// for the big tensors, we have the option to store the data in 16-bit floats
|
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||||
// in order to save memory and also to speed up the computation
|
// in order to save memory and also to speed up the computation
|
||||||
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
|
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||||
|
if (wtype == GGML_TYPE_COUNT) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||||
|
__func__, fname.c_str(), model.hparams.ftype);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
auto & ctx = model.ctx;
|
auto & ctx = model.ctx;
|
||||||
|
|
||||||
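A single f16 flag can no longer describe how the big tensors are stored once quantized files enter the picture, so the header now carries an ftype value and the loader maps it to a concrete tensor type with ggml_ftype_to_ggml_type, rejecting the file when the mapping comes back as GGML_TYPE_COUNT. A rough sketch of what such a mapping looks like; the enumerator list here is illustrative, the authoritative one lives in ggml.h:

    // illustrative ftype -> tensor-type mapping for the large weight matrices
    static enum ggml_type weight_type_from_ftype(enum ggml_ftype ftype) {
        switch (ftype) {
            case GGML_FTYPE_ALL_F32:     return GGML_TYPE_F32;
            case GGML_FTYPE_MOSTLY_F16:  return GGML_TYPE_F16;
            case GGML_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
            case GGML_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
            default:                     return GGML_TYPE_COUNT; // unknown, caller aborts the load
        }
    }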
@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
const int n_ctx = hparams.n_ctx;
|
const int n_ctx = hparams.n_ctx;
|
||||||
const int n_vocab = hparams.n_vocab;
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
|
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
||||||
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
|
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
||||||
|
|
||||||
ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte
|
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
|
||||||
ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
|
ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
|
||||||
|
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
|
||||||
|
|
||||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
|
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
||||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
|
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
||||||
|
|
||||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
|
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
|
||||||
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
|
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
|
||||||
|
|
||||||
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w
|
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
|
||||||
ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
|
ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
|
||||||
|
|
||||||
ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w
|
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
||||||
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b
|
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
|
||||||
|
|
||||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w
|
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
||||||
ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
|
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||||
|
|
||||||
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w
|
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||||
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
|
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||||
|
|
||||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
|
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
|
||||||
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
|
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
|
||||||
|
|
||||||
ctx_size += (6 + 12*n_layer)*256; // object overhead
|
ctx_size += (6 + 12*n_layer)*256; // object overhead
|
||||||
|
|
||||||
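All of the ctx_size terms switch from ggml_type_size to ggml_type_sizef because quantized types pack a whole block of weights into one struct, so the meaningful quantity for sizing is bytes per element as a float, i.e. the type size divided by the block size. For Q5_1, for example, a 32-weight block takes roughly 24 bytes (fp16 scale, fp16 minimum, 32 high bits and 16 bytes of packed nibbles), which is about 0.75 bytes per weight instead of 2 bytes for f16. A small sketch of that accounting, as an approximation of what the ggml helper does:

    // bytes per element as a float: for quantized types one block covers many weights
    static double type_sizef(enum ggml_type type) {
        return (double) ggml_type_size(type) / ggml_blck_size(type);
    }

    // e.g. the contribution of the token embedding matrix to the ctx_size estimate
    static size_t wte_bytes(int n_vocab, int n_embd, enum ggml_type wtype) {
        return (size_t) ((double) n_vocab * n_embd * type_sizef(wtype));
    }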
@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
|
|
||||||
// create the ggml context
|
// create the ggml context
|
||||||
{
|
{
|
||||||
struct ggml_init_params params;
|
struct ggml_init_params params = {
|
||||||
params.mem_size = ctx_size;
|
.mem_size = ctx_size,
|
||||||
params.mem_buffer = nullptr;
|
.mem_buffer = NULL,
|
||||||
|
.no_alloc = false,
|
||||||
|
};
|
||||||
|
|
||||||
model.ctx = ggml_init(params);
|
model.ctx = ggml_init(params);
|
||||||
if (!model.ctx) {
|
if (!model.ctx) {
|
||||||
@ -352,6 +219,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
|
|
||||||
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||||
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
|
||||||
|
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||||
|
|
||||||
// map by name
|
// map by name
|
||||||
model.tensors["model/ln_f/g"] = model.ln_f_g;
|
model.tensors["model/ln_f/g"] = model.ln_f_g;
|
||||||
@ -359,6 +227,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
|
|
||||||
model.tensors["model/wte"] = model.wte;
|
model.tensors["model/wte"] = model.wte;
|
||||||
model.tensors["model/wpe"] = model.wpe;
|
model.tensors["model/wpe"] = model.wpe;
|
||||||
|
model.tensors["model/lm_head"] = model.lm_head;
|
||||||
|
|
||||||
for (int i = 0; i < n_layer; ++i) {
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
auto & layer = model.layers[i];
|
auto & layer = model.layers[i];
|
||||||
@ -369,16 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd);
|
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
|
||||||
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
|
||||||
|
|
||||||
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||||
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
||||||
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||||
|
|
||||||
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||||
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
// map by name
|
// map by name
|
||||||
@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
|
||||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
|
||||||
|
|
||||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans;
|
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
|
||||||
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
{
|
{
|
||||||
size_t total_size = 0;
|
size_t total_size = 0;
|
||||||
|
|
||||||
|
bool has_lm_head = false;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
int32_t n_dims;
|
int32_t n_dims;
|
||||||
int32_t length;
|
int32_t length;
|
||||||
int32_t ftype;
|
int32_t ttype;
|
||||||
|
|
||||||
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||||
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||||
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||||
|
|
||||||
if (fin.eof()) {
|
if (fin.eof()) {
|
||||||
break;
|
break;
|
||||||
@ -448,7 +319,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
std::string name(length, 0);
|
std::string name(length, 0);
|
||||||
fin.read(&name[0], length);
|
fin.read(&name[0], length);
|
||||||
|
|
||||||
if (model.tensors.find(name) == model.tensors.end()) {
|
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||||
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
|
|
||||||
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||||
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
|
||||||
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
|
// for debugging
|
||||||
|
if (0) {
|
||||||
|
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||||
|
}
|
||||||
|
|
||||||
if (nelements*bpe != ggml_nbytes(tensor)) {
|
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||||
|
|
||||||
|
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||||
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||||
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||||
return false;
|
return false;
|
||||||
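Each tensor header in the file now stores a ggml type id (ttype) rather than the old 0/1 float flag, and the size validation divides by the block size so that quantized tensors are not rejected: the expected payload is nelements / block_size * type_size bytes. A tiny sketch of that check, assuming nelements is a multiple of the block size (which holds for these weight shapes):

    // expected on-disk size of a tensor stored as `ttype`
    // (block size is 1 for f32/f16 and 32 for the Q5 formats)
    static size_t expected_nbytes(int64_t nelements, enum ggml_type ttype) {
        return (size_t) nelements / ggml_blck_size(ttype) * ggml_type_size(ttype);
    }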
@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
|
|
||||||
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||||
|
|
||||||
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
// GPT-2 models share the WTE tensor as the LM head
|
||||||
|
if (name == "model/wte" && has_lm_head == false) {
|
||||||
|
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (name == "model/lm_head") {
|
||||||
|
has_lm_head = true;
|
||||||
|
}
|
||||||
|
|
||||||
total_size += ggml_nbytes(tensor);
|
total_size += ggml_nbytes(tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
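The new lm_head tensor exists because GPT-2 checkpoints tie the output projection to the token embedding: most files only ship model/wte, so the loader seeds lm_head with a copy of it and keeps a separate matrix only if a model/lm_head tensor shows up later in the stream. Condensing the logic from the hunk above into one helper (ggml.h provides ggml_tensor and ggml_nbytes):

    #include <cstring>
    #include <string>

    // seed lm_head from wte unless the file provides its own model/lm_head tensor
    static void maybe_tie_lm_head(const std::string & name, bool & has_lm_head,
                                  const struct ggml_tensor * src, struct ggml_tensor * lm_head) {
        if (name == "model/wte" && !has_lm_head) {
            std::memcpy(lm_head->data, src->data, ggml_nbytes(src));
        }
        if (name == "model/lm_head") {
            has_lm_head = true;
        }
    }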
@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
|
|||||||
// - n_threads: number of threads to use
|
// - n_threads: number of threads to use
|
||||||
// - n_past: the context size so far
|
// - n_past: the context size so far
|
||||||
// - embd_inp: the embeddings of the tokens in the context
|
// - embd_inp: the embeddings of the tokens in the context
|
||||||
// - embd_w: the predicted probabilities of the next token
|
// - embd_w: the predicted logits for the next token
|
||||||
//
|
//
|
||||||
bool gpt2_eval(
|
bool gpt2_eval(
|
||||||
const gpt2_model & model,
|
const gpt2_model & model,
|
||||||
@ -512,12 +396,12 @@ bool gpt2_eval(
|
|||||||
const int n_head = hparams.n_head;
|
const int n_head = hparams.n_head;
|
||||||
const int n_vocab = hparams.n_vocab;
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
static size_t buf_size = 5640ull*1024*1024;
|
static size_t buf_size = 512u*1024*1024;
|
||||||
static void * buf = malloc(buf_size);
|
static void * buf = malloc(buf_size);
|
||||||
|
|
||||||
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
|
||||||
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||||
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
|
||||||
|
|
||||||
// reallocate
|
// reallocate
|
||||||
buf_size = buf_size_new;
|
buf_size = buf_size_new;
|
||||||
@ -528,13 +412,14 @@ bool gpt2_eval(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_init_params params;
|
struct ggml_init_params params = {
|
||||||
params.mem_size = buf_size;
|
/*.mem_size =*/ buf_size,
|
||||||
params.mem_buffer = buf;
|
/*.mem_buffer =*/ buf,
|
||||||
|
/*.no_alloc =*/ false,
|
||||||
|
};
|
||||||
|
|
||||||
struct ggml_context * ctx0 = ggml_init(params);
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
|
struct ggml_cgraph gf = {};
|
||||||
struct ggml_cgraph gf = { };
|
|
||||||
gf.n_threads = n_threads;
|
gf.n_threads = n_threads;
|
||||||
|
|
||||||
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||||
@ -578,7 +463,7 @@ bool gpt2_eval(
|
|||||||
// [2304, N]
|
// [2304, N]
|
||||||
{
|
{
|
||||||
cur = ggml_mul_mat(ctx0,
|
cur = ggml_mul_mat(ctx0,
|
||||||
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
|
model.layers[il].c_attn_attn_w,
|
||||||
cur);
|
cur);
|
||||||
|
|
||||||
cur = ggml_add(ctx0,
|
cur = ggml_add(ctx0,
|
||||||
@ -654,11 +539,13 @@ bool gpt2_eval(
|
|||||||
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||||
// [n_past + N, 64, 12]
|
// [n_past + N, 64, 12]
|
||||||
struct ggml_tensor * V_trans =
|
struct ggml_tensor * V_trans =
|
||||||
|
ggml_cpy(ctx0,
|
||||||
ggml_permute(ctx0,
|
ggml_permute(ctx0,
|
||||||
ggml_reshape_3d(ctx0,
|
ggml_reshape_3d(ctx0,
|
||||||
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
|
||||||
n_embd/n_head, n_head, n_past + N),
|
n_embd/n_head, n_head, n_past + N),
|
||||||
1, 2, 0, 3);
|
1, 2, 0, 3),
|
||||||
|
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
|
||||||
|
|
||||||
// KQV = transpose(V) * KQ_soft_max
|
// KQV = transpose(V) * KQ_soft_max
|
||||||
// [64, N, 12]
|
// [64, N, 12]
|
||||||
@ -685,7 +572,7 @@ bool gpt2_eval(
|
|||||||
// [768, N]
|
// [768, N]
|
||||||
{
|
{
|
||||||
cur = ggml_mul_mat(ctx0,
|
cur = ggml_mul_mat(ctx0,
|
||||||
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
|
model.layers[il].c_attn_proj_w,
|
||||||
cur);
|
cur);
|
||||||
|
|
||||||
cur = ggml_add(ctx0,
|
cur = ggml_add(ctx0,
|
||||||
@ -722,7 +609,7 @@ bool gpt2_eval(
|
|||||||
// cur = fc_w*cur + fc_b
|
// cur = fc_w*cur + fc_b
|
||||||
// [3072, N]
|
// [3072, N]
|
||||||
cur = ggml_mul_mat(ctx0,
|
cur = ggml_mul_mat(ctx0,
|
||||||
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
|
model.layers[il].c_mlp_fc_w,
|
||||||
cur);
|
cur);
|
||||||
|
|
||||||
cur = ggml_add(ctx0,
|
cur = ggml_add(ctx0,
|
||||||
@ -742,7 +629,7 @@ bool gpt2_eval(
|
|||||||
// cur = proj_w*cur + proj_b
|
// cur = proj_w*cur + proj_b
|
||||||
// [768, N]
|
// [768, N]
|
||||||
cur = ggml_mul_mat(ctx0,
|
cur = ggml_mul_mat(ctx0,
|
||||||
model.layers[il].c_mlp_proj_w_trans,
|
model.layers[il].c_mlp_proj_w,
|
||||||
cur);
|
cur);
|
||||||
|
|
||||||
cur = ggml_add(ctx0,
|
cur = ggml_add(ctx0,
|
||||||
@ -769,12 +656,12 @@ bool gpt2_eval(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// inpL = WTE * inpL
|
// inpL = WTE * inpL
|
||||||
// [ 768, 50257] - model.wte
|
// [ 768, 50257] - model.lm_head
|
||||||
// [ 768, N] - inpL
|
// [ 768, N] - inpL
|
||||||
inpL = ggml_mul_mat(ctx0, model.wte, inpL);
|
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
|
||||||
|
|
||||||
// logits -> probs
|
// logits -> probs
|
||||||
inpL = ggml_soft_max(ctx0, inpL);
|
//inpL = ggml_soft_max(ctx0, inpL);
|
||||||
|
|
||||||
// run the computation
|
// run the computation
|
||||||
ggml_build_forward_expand(&gf, inpL);
|
ggml_build_forward_expand(&gf, inpL);
|
||||||
@ -788,7 +675,7 @@ bool gpt2_eval(
|
|||||||
//embd_w.resize(n_vocab*N);
|
//embd_w.resize(n_vocab*N);
|
||||||
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||||
|
|
||||||
// return result for just the last token
|
// return result just for the last token
|
||||||
embd_w.resize(n_vocab);
|
embd_w.resize(n_vocab);
|
||||||
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||||
|
|
||||||
|
@ -2,18 +2,12 @@
|
|||||||
|
|
||||||
// TODO: Change to C-style API and move to ./examples for easy reuse.
|
// TODO: Change to C-style API and move to ./examples for easy reuse.
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
struct gpt_vocab {
|
|
||||||
using id = int32_t;
|
|
||||||
using token = std::string;
|
|
||||||
|
|
||||||
std::map<token, id> token_to_id;
|
|
||||||
std::map<id, token> id_to_token;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct gpt2_context;
|
struct gpt2_context;
|
||||||
|
|
||||||
struct gpt2_context * gpt2_init(const char * path_model);
|
struct gpt2_context * gpt2_init(const char * path_model);
|
||||||
|
@ -31,9 +31,9 @@ endif()
|
|||||||
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||||
--bind \
|
--bind \
|
||||||
-s USE_PTHREADS=1 \
|
-s USE_PTHREADS=1 \
|
||||||
-s PTHREAD_POOL_SIZE=8 \
|
-s PTHREAD_POOL_SIZE_STRICT=0 \
|
||||||
-s INITIAL_MEMORY=1500MB \
|
-s INITIAL_MEMORY=2000MB \
|
||||||
-s TOTAL_MEMORY=1500MB \
|
-s TOTAL_MEMORY=2000MB \
|
||||||
-s FORCE_FILESYSTEM=1 \
|
-s FORCE_FILESYSTEM=1 \
|
||||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||||
${EXTRA_FLAGS} \
|
${EXTRA_FLAGS} \
|
||||||
|
@ -10,6 +10,12 @@ std::thread g_worker;
|
|||||||
|
|
||||||
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
||||||
|
|
||||||
|
static inline int mpow2(int n) {
|
||||||
|
int p = 1;
|
||||||
|
while (p <= n) p *= 2;
|
||||||
|
return p/2;
|
||||||
|
}
|
||||||
|
|
||||||
EMSCRIPTEN_BINDINGS(whisper) {
|
EMSCRIPTEN_BINDINGS(whisper) {
|
||||||
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
||||||
if (g_worker.joinable()) {
|
if (g_worker.joinable()) {
|
||||||
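mpow2 rounds the reported hardware concurrency down to a power of two; together with the new nthreads argument to full_default below, the worker count becomes min(requested, min(16, mpow2(hardware_concurrency))). A self-contained sketch of how that clamp behaves (the helper is reproduced from the hunk above, the loop is only for illustration):

    #include <algorithm>
    #include <cstdio>
    #include <thread>

    // largest power of two that is <= n
    static int mpow2(int n) {
        int p = 1;
        while (p <= n) p *= 2;
        return p/2;
    }

    int main() {
        const int hw = (int) std::thread::hardware_concurrency(); // e.g. 6 on a hexa-core machine
        for (int requested = 1; requested <= 16; requested += 5) {
            const int used = std::min(requested, std::min(16, mpow2(hw)));
            std::printf("requested=%2d -> used=%2d (mpow2(%d)=%d)\n", requested, used, hw, mpow2(hw));
        }
        return 0;
    }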
@ -43,7 +49,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
|||||||
}
|
}
|
||||||
}));
|
}));
|
||||||
|
|
||||||
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
|
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) {
|
||||||
if (g_worker.joinable()) {
|
if (g_worker.joinable()) {
|
||||||
g_worker.join();
|
g_worker.join();
|
||||||
}
|
}
|
||||||
@ -66,7 +72,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
|||||||
params.print_special = false;
|
params.print_special = false;
|
||||||
params.translate = translate;
|
params.translate = translate;
|
||||||
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
|
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
|
||||||
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency());
|
params.n_threads = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
|
||||||
params.offset_ms = 0;
|
params.offset_ms = 0;
|
||||||
|
|
||||||
std::vector<float> pcmf32;
|
std::vector<float> pcmf32;
|
||||||
|
@ -40,21 +40,42 @@
|
|||||||
|
|
||||||
Note that the computation is quite heavy and may take a few seconds to complete.<br>
|
Note that the computation is quite heavy and may take a few seconds to complete.<br>
|
||||||
The transcription results will be displayed in the text area below.<br><br>
|
The transcription results will be displayed in the text area below.<br><br>
|
||||||
<b>Important: your browser must support WASM SIMD instructions for this to work.</b>
|
<b>Important:</b>
|
||||||
|
<ul>
|
||||||
|
<li>your browser must support WASM SIMD instructions for this to work</li>
|
||||||
|
<li>Firefox cannot load files larger than 256 MB - use Chrome instead</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
<br><br><hr>
|
<b>More examples:</b>
|
||||||
|
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||||
|
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||||
|
|
||||||
|
<hr>
|
||||||
|
|
||||||
<div id="model">
|
<div id="model">
|
||||||
Whisper model: <span id="model-whisper-status"></span>
|
Whisper models: <span id="model-whisper-status"></span><br><br>
|
||||||
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
|
||||||
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
|
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
|
||||||
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
|
||||||
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
|
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
|
||||||
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
|
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
|
||||||
<button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
|
<button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
|
||||||
<span id="fetch-whisper-progress"></span>
|
|
||||||
|
|
||||||
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
|
||||||
|
<br><br>
|
||||||
|
Quantized models:<br><br>
|
||||||
|
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
|
||||||
|
<button id="fetch-whisper-tiny-q5_1" onclick="loadWhisper('tiny-q5_1')">tiny (Q5_1, 31 MB)</button>
|
||||||
|
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
|
||||||
|
<button id="fetch-whisper-base-q5_1" onclick="loadWhisper('base-q5_1')">base (Q5_1, 57 MB)</button>
|
||||||
|
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
|
||||||
|
<button id="fetch-whisper-small-q5_1" onclick="loadWhisper('small-q5_1')">small (Q5_1, 182 MB)</button><br>
|
||||||
|
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
|
||||||
|
<button id="fetch-whisper-medium-q5_0" onclick="loadWhisper('medium-q5_0')">medium (Q5_0, 515 MB)</button>
|
||||||
|
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
|
||||||
|
<span id="fetch-whisper-progress"></span>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<br>
|
<br>
|
||||||
@ -161,6 +182,12 @@
|
|||||||
<option value="yi">Yiddish</option>
|
<option value="yi">Yiddish</option>
|
||||||
</select>
|
</select>
|
||||||
</td>
|
</td>
|
||||||
|
<!-- Slider to select number of threads between 1 and 16 -->
|
||||||
|
<td>
|
||||||
|
Threads:
|
||||||
|
<input type="range" id="threads" name="threads" min="1" max="16" value="8" onchange="changeThreads(this.value)" />
|
||||||
|
<span id="threads-value">8</span>
|
||||||
|
</td>
|
||||||
<td>
|
<td>
|
||||||
<button onclick="onProcess(false);">Transcribe</button>
|
<button onclick="onProcess(false);">Transcribe</button>
|
||||||
</td>
|
</td>
|
||||||
@ -263,11 +290,13 @@
|
|||||||
|
|
||||||
Module.FS_createDataFile("/", fname, buf, true, true);
|
Module.FS_createDataFile("/", fname, buf, true, true);
|
||||||
|
|
||||||
model_whisper = fname;
|
//model_whisper = fname;
|
||||||
|
|
||||||
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
|
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
|
||||||
|
|
||||||
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
|
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
|
||||||
|
|
||||||
|
document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper;
|
||||||
}
|
}
|
||||||
|
|
||||||
function loadFile(event, fname) {
|
function loadFile(event, fname) {
|
||||||
@ -292,6 +321,17 @@
|
|||||||
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
||||||
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
||||||
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
||||||
|
|
||||||
|
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
|
||||||
|
|
||||||
document.getElementById('whisper-file' ).style.display = 'none';
|
document.getElementById('whisper-file' ).style.display = 'none';
|
||||||
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
|
||||||
}
|
}
|
||||||
@ -304,6 +344,16 @@
|
|||||||
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
|
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
|
||||||
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
|
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
|
||||||
'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
|
'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
|
||||||
|
|
||||||
|
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
|
||||||
|
'tiny-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin',
|
||||||
|
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
|
||||||
|
'base-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base-q5_1.bin',
|
||||||
|
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
|
||||||
|
'small-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small-q5_1.bin',
|
||||||
|
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
|
||||||
|
'medium-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-medium-q5_0.bin',
|
||||||
|
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
|
||||||
};
|
};
|
||||||
|
|
||||||
let sizes = {
|
let sizes = {
|
||||||
@ -313,6 +363,16 @@
|
|||||||
'base': 142,
|
'base': 142,
|
||||||
'small.en': 466,
|
'small.en': 466,
|
||||||
'small': 466,
|
'small': 466,
|
||||||
|
|
||||||
|
'tiny-en-q5_1': 31,
|
||||||
|
'tiny-q5_1': 31,
|
||||||
|
'base-en-q5_1': 57,
|
||||||
|
'base-q5_1': 57,
|
||||||
|
'small-en-q5_1': 182,
|
||||||
|
'small-q5_1': 182,
|
||||||
|
'medium-en-q5_0': 515,
|
||||||
|
'medium-q5_0': 515,
|
||||||
|
'large-q5_0': 1030,
|
||||||
};
|
};
|
||||||
|
|
||||||
let url = urls[model];
|
let url = urls[model];
|
||||||
@ -327,8 +387,19 @@
|
|||||||
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
|
||||||
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
document.getElementById('fetch-whisper-base' ).style.display = 'none';
|
||||||
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
document.getElementById('fetch-whisper-small' ).style.display = 'none';
|
||||||
|
|
||||||
|
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
|
||||||
|
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
|
||||||
|
|
||||||
document.getElementById('whisper-file' ).style.display = 'none';
|
document.getElementById('whisper-file' ).style.display = 'none';
|
||||||
document.getElementById('model-whisper-status' ).innerHTML = 'loading model: ' + model;
|
document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model;
|
||||||
|
|
||||||
cbProgress = function(p) {
|
cbProgress = function(p) {
|
||||||
let el = document.getElementById('fetch-whisper-progress');
|
let el = document.getElementById('fetch-whisper-progress');
|
||||||
@ -337,14 +408,26 @@
|
|||||||
|
|
||||||
cbCancel = function() {
|
cbCancel = function() {
|
||||||
var el;
|
var el;
|
||||||
|
|
||||||
el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
|
||||||
|
|
||||||
|
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-tiny-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-base-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-small-q5_1' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-medium-q5_0' ); if (el) el.style.display = 'inline-block';
|
||||||
|
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
|
||||||
|
|
||||||
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
|
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
|
||||||
el = document.getElementById('model-whisper-status' ); if (el) el.innerHTML = '';
|
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
|
||||||
};
|
};
|
||||||
|
|
||||||
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
|
||||||
@ -354,7 +437,8 @@
|
|||||||
// audio file
|
// audio file
|
||||||
//
|
//
|
||||||
|
|
||||||
const kMaxAudio_s = 120;
|
const kMaxAudio_s = 30*60;
|
||||||
|
const kMaxRecording_s = 2*60;
|
||||||
const kSampleRate = 16000;
|
const kSampleRate = 16000;
|
||||||
|
|
||||||
window.AudioContext = window.AudioContext || window.webkitAudioContext;
|
window.AudioContext = window.AudioContext || window.webkitAudioContext;
|
||||||
@ -423,7 +507,7 @@
|
|||||||
doRecording = false;
|
doRecording = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// record up to kMaxAudio_s seconds of audio from the microphone
|
// record up to kMaxRecording_s seconds of audio from the microphone
|
||||||
// check if doRecording is false every 1000 ms and stop recording if so
|
// check if doRecording is false every 1000 ms and stop recording if so
|
||||||
// update progress information
|
// update progress information
|
||||||
function startRecording() {
|
function startRecording() {
|
||||||
@ -479,9 +563,9 @@
|
|||||||
printTextarea('js: audio recorded, size: ' + audio.length);
|
printTextarea('js: audio recorded, size: ' + audio.length);
|
||||||
|
|
||||||
// truncate to first 30 seconds
|
// truncate to first 30 seconds
|
||||||
if (audio.length > kMaxAudio_s*kSampleRate) {
|
if (audio.length > kMaxRecording_s*kSampleRate) {
|
||||||
audio = audio.slice(0, kMaxAudio_s*kSampleRate);
|
audio = audio.slice(0, kMaxRecording_s*kSampleRate);
|
||||||
printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds');
|
printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds');
|
||||||
}
|
}
|
||||||
setAudio(audio);
|
setAudio(audio);
|
||||||
});
|
});
|
||||||
@ -509,24 +593,31 @@
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%';
|
document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%';
|
||||||
document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%';
|
document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%';
|
||||||
}, 1000);
|
}, 1000);
|
||||||
|
|
||||||
printTextarea('js: recording ...');
|
printTextarea('js: recording ...');
|
||||||
|
|
||||||
setTimeout(function() {
|
setTimeout(function() {
|
||||||
if (doRecording) {
|
if (doRecording) {
|
||||||
printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds');
|
printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds');
|
||||||
stopRecording();
|
stopRecording();
|
||||||
}
|
}
|
||||||
}, kMaxAudio_s*1000);
|
}, kMaxRecording_s*1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
// transcribe
|
// transcribe
|
||||||
//
|
//
|
||||||
|
|
||||||
|
var nthreads = 8;
|
||||||
|
|
||||||
|
function changeThreads(value) {
|
||||||
|
nthreads = value;
|
||||||
|
document.getElementById('threads-value').innerHTML = nthreads;
|
||||||
|
}
|
||||||
|
|
||||||
function onProcess(translate) {
|
function onProcess(translate) {
|
||||||
if (!instance) {
|
if (!instance) {
|
||||||
instance = Module.init('whisper.bin');
|
instance = Module.init('whisper.bin');
|
||||||
@ -553,7 +644,7 @@
|
|||||||
printTextarea('');
|
printTextarea('');
|
||||||
|
|
||||||
setTimeout(function() {
|
setTimeout(function() {
|
||||||
var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate);
|
var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate);
|
||||||
console.log('js: full_default returned: ' + ret);
|
console.log('js: full_default returned: ' + ret);
|
||||||
if (ret) {
|
if (ret) {
|
||||||
printTextarea("js: whisper returned: " + ret);
|
printTextarea("js: whisper returned: " + ret);
|
||||||
|
45
extra/quantize-all.sh
Executable file
@ -0,0 +1,45 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
printf "Usage: $0 <upload>"
|
||||||
|
|
||||||
|
if [ $# -ne 1 ]; then
|
||||||
|
printf "\nError: Invalid number of arguments\n"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
qtype0="q5_0"
|
||||||
|
qtype1="q5_1"
|
||||||
|
upload="$1"
|
||||||
|
|
||||||
|
cd `dirname $0`
|
||||||
|
cd ../
|
||||||
|
|
||||||
|
./quantize ./models/ggml-tiny.en.bin ./models/ggml-tiny.en-${qtype1}.bin ${qtype1}
|
||||||
|
./quantize ./models/ggml-tiny.bin ./models/ggml-tiny-${qtype1}.bin ${qtype1}
|
||||||
|
|
||||||
|
./quantize ./models/ggml-base.en.bin ./models/ggml-base.en-${qtype1}.bin ${qtype1}
|
||||||
|
./quantize ./models/ggml-base.bin ./models/ggml-base-${qtype1}.bin ${qtype1}
|
||||||
|
|
||||||
|
./quantize ./models/ggml-small.en.bin ./models/ggml-small.en-${qtype1}.bin ${qtype1}
|
||||||
|
./quantize ./models/ggml-small.bin ./models/ggml-small-${qtype1}.bin ${qtype1}
|
||||||
|
|
||||||
|
./quantize ./models/ggml-medium.en.bin ./models/ggml-medium.en-${qtype0}.bin ${qtype0}
|
||||||
|
./quantize ./models/ggml-medium.bin ./models/ggml-medium-${qtype0}.bin ${qtype0}
|
||||||
|
|
||||||
|
./quantize ./models/ggml-large.bin ./models/ggml-large-${qtype0}.bin ${qtype0}
|
||||||
|
|
||||||
|
if [ "$upload" == "1" ]; then
|
||||||
|
scp ./models/ggml-tiny.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny.en-${qtype1}.bin
|
||||||
|
scp ./models/ggml-tiny-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny-${qtype1}.bin
|
||||||
|
|
||||||
|
scp ./models/ggml-base.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base.en-${qtype1}.bin
|
||||||
|
scp ./models/ggml-base-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base-${qtype1}.bin
|
||||||
|
|
||||||
|
scp ./models/ggml-small.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small.en-${qtype1}.bin
|
||||||
|
scp ./models/ggml-small-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small-${qtype1}.bin
|
||||||
|
|
||||||
|
scp ./models/ggml-medium.en-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium.en-${qtype0}.bin
|
||||||
|
scp ./models/ggml-medium-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium-${qtype0}.bin
|
||||||
|
|
||||||
|
scp ./models/ggml-large-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-large-${qtype0}.bin
|
||||||
|
fi
|
@ -4,3 +4,7 @@ cp -rpv ../ggml/src/ggml.c ./ggml.c
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
cp -rpv ../ggml/examples/common.h ./examples/common.h
cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp
cp -rpv ../ggml/examples/common-ggml.h ./examples/common-ggml.h
cp -rpv ../ggml/examples/common-ggml.cpp ./examples/common-ggml.cpp
162 ggml.c
@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
// precomputed f32 table for f16 (256 KB)
static float table_f32_f16[1 << 16];

#if defined(__ARM_NEON)
#if defined(__ARM_NEON) || defined(__wasm_simd128__)
#define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
#define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
#define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
    }

    *s = vaddvq_f32(sumv);
#elif defined(__wasm_simd128__)
    v128_t sumv = wasm_f32x4_splat(0.0f);

    uint64_t tmp[4];

    for (int i = 0; i < nb; ++i) {
        const block_q5_0 * restrict x0 = &x[i];
        const block_q8_0 * restrict y0 = &y[i];

        const v128_t m4b  = wasm_i8x16_splat(0x0F);
        const v128_t s16b = wasm_i8x16_splat(0x10);

        // extract the 5th bit
        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
        tmp[3] = table_b2b_u[(qh >> 24)       ];

        const v128_t qhl = wasm_v128_load(tmp + 0);
        const v128_t qhh = wasm_v128_load(tmp + 2);

        const v128_t v0 = wasm_v128_load(x0->qs);

        // 4-bit -> 8-bit
        const v128_t v0l = wasm_v128_and (v0, m4b);
        const v128_t v0h = wasm_u8x16_shr(v0, 4);

        // interleave
        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

        // add high bit and sub 16
        const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
        const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);

        // load y
        const v128_t v1l = wasm_v128_load(y0->qs);
        const v128_t v1h = wasm_v128_load(y0->qs + 16);

        // int8x16 -> int16x8
        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);

        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);

        const float x0d = GGML_FP16_TO_FP32(x0->d);

        // dot product
        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
                        wasm_i32x4_add(
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
    }

    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
#elif defined(__AVX2__)
    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();
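The WASM path mirrors the NEON one: each q5_0 block stores 32 weights as 4-bit quants plus one extra high bit per weight (packed into qh) and a single fp16 scale d, and a weight is reconstructed as ((nibble | high_bit << 4) - 16) * d. Below is a scalar sketch of that dequantization with a plain float standing in for the fp16 scale; the element ordering follows the interleaving implied by the shuffles above and is illustrative only — the authoritative layout is whatever quantize_row_q5_0() in ggml.c writes:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // Scalar sketch of q5_0 dequantization (illustration only; ggml stores d as fp16).
    struct block_q5_0_sketch {
        float   d;      // scale
        uint8_t qh[4];  // 5th bit of each of the 32 quants
        uint8_t qs[16]; // 32 x 4-bit quants, two per byte
    };

    static void dequantize_q5_0_sketch(const block_q5_0_sketch & b, float * y) {
        uint32_t qh = 0;
        std::memcpy(&qh, b.qh, sizeof(qh));

        for (int j = 0; j < 16; ++j) {
            const int h_lo = ((qh >> (2*j + 0)) & 1) << 4; // high bit of quant 2*j
            const int h_hi = ((qh >> (2*j + 1)) & 1) << 4; // high bit of quant 2*j + 1

            const int q_lo = (b.qs[j] & 0x0F) | h_lo;      // low nibble  -> quant 2*j
            const int q_hi = (b.qs[j] >>   4) | h_hi;      // high nibble -> quant 2*j + 1

            y[2*j + 0] = (q_lo - 16) * b.d;
            y[2*j + 1] = (q_hi - 16) * b.d;
        }
    }

    int main() {
        block_q5_0_sketch b = {};
        b.d     = 0.5f;
        b.qs[0] = 0x21; // quant 0 = 1, quant 1 = 2 (before the -16 offset and scaling)

        float y[32];
        dequantize_q5_0_sketch(b, y);
        std::printf("%f %f\n", y[0], y[1]); // -7.500000 -7.000000
        return 0;
    }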
@ -3311,6 +3377,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
    }

    *s = vaddvq_f32(sumv) + summs;
#elif defined(__wasm_simd128__)
    v128_t sumv = wasm_f32x4_splat(0.0f);

    float summs = 0.0f;

    uint64_t tmp[4];

    for (int i = 0; i < nb; ++i) {
        const block_q5_1 * restrict x0 = &x[i];
        const block_q8_1 * restrict y0 = &y[i];

        summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);

        const v128_t m4b = wasm_i8x16_splat(0x0F);

        // extract the 5th bit
        uint32_t qh;
        memcpy(&qh, x0->qh, sizeof(qh));

        tmp[0] = table_b2b_u[(qh >>  0) & 0xFF];
        tmp[1] = table_b2b_u[(qh >>  8) & 0xFF];
        tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
        tmp[3] = table_b2b_u[(qh >> 24)       ];

        const v128_t qhl = wasm_v128_load(tmp + 0);
        const v128_t qhh = wasm_v128_load(tmp + 2);

        const v128_t v0 = wasm_v128_load(x0->qs);

        // 4-bit -> 8-bit
        const v128_t v0l = wasm_v128_and (v0, m4b);
        const v128_t v0h = wasm_u8x16_shr(v0, 4);

        static bool x = true;

        // interleave
        const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
        const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);

        // add high bit
        const v128_t v0lf = wasm_v128_or(v0lz, qhl);
        const v128_t v0hf = wasm_v128_or(v0hz, qhh);

        // load y
        const v128_t v1l = wasm_v128_load(y0->qs);
        const v128_t v1h = wasm_v128_load(y0->qs + 16);

        // int8x16 -> int16x8
        const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
        const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
        const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
        const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);

        const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
        const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
        const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
        const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);

        const float x0d = GGML_FP16_TO_FP32(x0->d);

        // dot product
        sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
                        wasm_i32x4_add(
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
                                           wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
                            wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
                                           wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
    }

    *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
         wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
#elif defined(__AVX2__)
    // Initialize accumulator with zeros
    __m256 acc = _mm256_setzero_ps();
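Both new kernels expand the 32 packed high bits through table_b2b_u before OR-ing them into the nibbles: each 8-bit index is turned into eight bytes that are either 0x00 or 0x10. A self-contained sketch of an equivalent table; the real one in ggml.c is generated by the B1/B2/... macros shown in the first hunk, so its exact byte ordering may differ from this illustration:

    #include <cstdint>
    #include <cstdio>

    // Sketch: expand each of the 8 bits of an index into one byte of a 64-bit value,
    // 0x10 if the bit is set and 0x00 otherwise (conceptually what table_b2b_u provides).
    static uint64_t b2b_entry(uint8_t idx) {
        uint64_t v = 0;
        for (int bit = 0; bit < 8; ++bit) {
            if (idx & (1u << bit)) {
                v |= (uint64_t) 0x10 << (8*bit); // bit 'bit' -> byte 'bit'
            }
        }
        return v;
    }

    int main() {
        static uint64_t table[256];
        for (int i = 0; i < 256; ++i) {
            table[i] = b2b_entry((uint8_t) i);
        }
        std::printf("table[0x03] = 0x%016llx\n", (unsigned long long) table[0x03]); // 0x0000000000001010
        return 0;
    }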
@ -3827,6 +3964,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
    "DIAG_MASK_INF",
    "SOFT_MAX",
    "ROPE",
    "ALIBI",
    "CONV_1D_1S",
    "CONV_1D_2S",

@ -3875,6 +4013,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "diag_mask_inf(x)",
    "soft_max(x)",
    "rope(x)",
    "alibi(x)",
    "conv_1d_1s(x)",
    "conv_1d_2s(x)",
@ -4055,6 +4194,27 @@ bool ggml_is_quantized(enum ggml_type type) {
    return GGML_IS_QUANTIZED[type];
}

enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
    enum ggml_type wtype = GGML_TYPE_COUNT;

    switch (ftype) {
        case GGML_FTYPE_ALL_F32:              wtype = GGML_TYPE_F32;   break;
        case GGML_FTYPE_MOSTLY_F16:           wtype = GGML_TYPE_F16;   break;
        case GGML_FTYPE_MOSTLY_Q4_0:          wtype = GGML_TYPE_Q4_0;  break;
        case GGML_FTYPE_MOSTLY_Q4_1:          wtype = GGML_TYPE_Q4_1;  break;
        case GGML_FTYPE_MOSTLY_Q4_2:          wtype = GGML_TYPE_Q4_2;  break;
        case GGML_FTYPE_MOSTLY_Q5_0:          wtype = GGML_TYPE_Q5_0;  break;
        case GGML_FTYPE_MOSTLY_Q5_1:          wtype = GGML_TYPE_Q5_1;  break;
        case GGML_FTYPE_MOSTLY_Q8_0:          wtype = GGML_TYPE_Q8_0;  break;
        case GGML_FTYPE_UNKNOWN:              wtype = GGML_TYPE_COUNT; break;
        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
    }

    GGML_ASSERT(wtype != GGML_TYPE_COUNT);

    return wtype;
}

static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
    return tensor->nb[0] > tensor->nb[1];
}
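A minimal usage sketch for the new helper (it needs ggml.h and libggml from this commit); note that the GGML_ASSERT above means GGML_FTYPE_UNKNOWN and the mixed Q4_1_SOME_F16 format abort instead of returning an error:

    #include <cstdio>
    #include "ggml.h"

    int main() {
        // Map a whole-file type to the per-tensor type used for the large weight matrices.
        const enum ggml_type t0 = ggml_ftype_to_ggml_type(GGML_FTYPE_MOSTLY_Q5_0); // -> GGML_TYPE_Q5_0
        const enum ggml_type t1 = ggml_ftype_to_ggml_type(GGML_FTYPE_ALL_F32);     // -> GGML_TYPE_F32

        std::printf("%d %d\n", (int) t0, (int) t1);
        return 0;
    }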
16 ggml.h
@ -232,6 +232,20 @@ extern "C" {
        GGML_TYPE_COUNT,
    };

    // model file types
    enum ggml_ftype {
        GGML_FTYPE_UNKNOWN = -1,
        GGML_FTYPE_ALL_F32 = 0,
        GGML_FTYPE_MOSTLY_F16 = 1,  // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
        GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
        GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
    };

    // available tensor operations:
    enum ggml_op {
        GGML_OP_NONE = 0,

@ -385,6 +399,8 @@ extern "C" {

    GGML_API bool ggml_is_quantized(enum ggml_type type);

    GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);

    // main

    GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
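The enum is the file-level counterpart of the per-tensor ggml_type: a "MOSTLY_*" file quantizes the large 2-D weight matrices but keeps 1-D tensors (biases, norms) in higher precision, per the "except 1d tensors" comments. A sketch of that per-tensor decision follows; the F32 fallback for 1-D tensors is an assumption for illustration, not the exact rule of any particular tool:

    #include "ggml.h"

    // Sketch only: pick the storage type for one tensor of a "mostly quantized" file.
    static enum ggml_type tensor_type_for(enum ggml_ftype ftype, int n_dims) {
        if (n_dims == 1) {
            return GGML_TYPE_F32; // 1D tensors stay unquantized (assumed fallback)
        }
        return ggml_ftype_to_ggml_type(ftype); // 2D weights use the file's quantized type
    }

    int main() {
        return tensor_type_for(GGML_FTYPE_MOSTLY_Q5_1, 2) == GGML_TYPE_Q5_1 ? 0 : 1;
    }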
252 whisper.cpp
@ -1,4 +1,3 @@
#define WHISPER_BUILD
#include "whisper.h"
#if WHISPER_USE_COREML
#include "coreml/whisper-encoder.h"
@ -255,12 +254,70 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
    { MODEL_LARGE, 9ull*MB },
};

static const std::map<e_model, size_t> MEM_REQ_MODEL = {
static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
    { GGML_TYPE_F32,
        {
            { MODEL_TINY,     74ull*MB },
            { MODEL_BASE,    142ull*MB },
            { MODEL_SMALL,   466ull*MB },
            { MODEL_MEDIUM, 1464ull*MB },
            { MODEL_LARGE,  2952ull*MB },
        },
    },
    { GGML_TYPE_F16,
        {
            { MODEL_TINY,     74ull*MB },
            { MODEL_BASE,    142ull*MB },
            { MODEL_SMALL,   466ull*MB },
            { MODEL_MEDIUM, 1464ull*MB },
            { MODEL_LARGE,  2952ull*MB },
        },
    },
    { GGML_TYPE_Q4_0,
        {
            { MODEL_TINY,     26ull*MB },
            { MODEL_BASE,     50ull*MB },
            { MODEL_SMALL,   154ull*MB },
            { MODEL_MEDIUM,  470ull*MB },
            { MODEL_LARGE,   940ull*MB },
        },
    },
    { GGML_TYPE_Q4_1,
        {
            { MODEL_TINY,     31ull*MB },
            { MODEL_BASE,     57ull*MB },
            { MODEL_SMALL,   181ull*MB },
            { MODEL_MEDIUM,  559ull*MB },
            { MODEL_LARGE,  1122ull*MB },
        },
    },
    { GGML_TYPE_Q4_2,
        {
            { MODEL_TINY,     26ull*MB },
            { MODEL_BASE,     50ull*MB },
            { MODEL_SMALL,   154ull*MB },
            { MODEL_MEDIUM,  470ull*MB },
            { MODEL_LARGE,   940ull*MB },
        },
    },
    { GGML_TYPE_Q5_0, // TODO: fix
        {
            { MODEL_TINY,     31ull*MB },
            { MODEL_BASE,     57ull*MB },
            { MODEL_SMALL,   181ull*MB },
            { MODEL_MEDIUM,  559ull*MB },
            { MODEL_LARGE,  1122ull*MB },
        },
    },
    { GGML_TYPE_Q5_1,
        {
            { MODEL_TINY,     31ull*MB },
            { MODEL_BASE,     57ull*MB },
            { MODEL_SMALL,   181ull*MB },
            { MODEL_MEDIUM,  559ull*MB },
            { MODEL_LARGE,  1122ull*MB },
        },
    },
};

static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
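MEM_REQ_MODEL is now keyed first by weight type and then by model size, and lookups go through .at(), so an unexpected type/model combination throws instead of silently inserting a zero entry. A small self-contained illustration of that access pattern with hypothetical values:

    #include <cstdio>
    #include <map>
    #include <stdexcept>
    #include <string>

    int main() {
        // Same shape as MEM_REQ_MODEL: outer key = weight type, inner key = model size.
        const std::map<std::string, std::map<std::string, size_t>> mem_req = {
            { "q5_1", { { "tiny", 31 }, { "base",  57 } } },
            { "f16",  { { "tiny", 74 }, { "base", 142 } } },
        };

        std::printf("q5_1/tiny -> %zu MB\n", mem_req.at("q5_1").at("tiny")); // 31 MB

        try {
            mem_req.at("q8_0"); // missing outer key
        } catch (const std::out_of_range &) {
            std::printf("unknown type -> std::out_of_range\n");
        }
        return 0;
    }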
@ -370,7 +427,7 @@ struct whisper_hparams {
    int32_t n_text_head = 6;
    int32_t n_text_layer = 4;
    int32_t n_mels = 80;
    int32_t f16 = 1;
    int32_t ftype = 1;
};

// audio encoding layer
@ -640,7 +697,8 @@ struct whisper_context {
    int64_t t_load_us = 0;
    int64_t t_start_us = 0;

    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
    ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
    ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)

    whisper_model model;
    whisper_vocab vocab;
@ -697,7 +755,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
    const ggml_type wtype = cache.k->type;
    WHISPER_ASSERT(wtype == cache.v->type);

    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
    WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));

    struct ggml_init_params params = {
        /*.mem_size =*/ cache.buf.size(),
@ -770,7 +828,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        read_safe(loader, hparams.n_text_head);
        read_safe(loader, hparams.n_text_layer);
        read_safe(loader, hparams.n_mels);
        read_safe(loader, hparams.f16);
        read_safe(loader, hparams.ftype);

        assert(hparams.n_text_state == hparams.n_audio_state);
@ -794,11 +852,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
            model.type = e_model::MODEL_LARGE;
        }

        // for the big tensors, we have the option to store the data in 16-bit floats
        // for the big tensors, we have the option to store the data in 16-bit floats or quantized
        // in order to save memory and also to speed up the computation
        wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
        wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
        if (wctx.wtype == GGML_TYPE_COUNT) {
            fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
            return false;
        }

        const size_t scale = model.hparams.f16 ? 1 : 2;
        const size_t scale = model.hparams.ftype ? 1 : 2;

        fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@ -810,18 +872,18 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
        fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
        fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
        fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
        fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
        fprintf(stderr, "%s: type = %d\n", __func__, model.type);

        // print memory requirements
        {
            // this is the total memory required to run the inference
            const size_t mem_required =
                     MEM_REQ_SCRATCH0.at (model.type) +
                     MEM_REQ_SCRATCH0.at(model.type) +
                     MEM_REQ_SCRATCH1.at (model.type) +
                     MEM_REQ_SCRATCH1.at(model.type) +
                     MEM_REQ_SCRATCH2.at (model.type) +
                     MEM_REQ_SCRATCH2.at(model.type) +
                     MEM_REQ_SCRATCH3.at (model.type) +
                     MEM_REQ_SCRATCH3.at(model.type) +
                scale*MEM_REQ_MODEL.at (model.type) +
                scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
                scale*MEM_REQ_KV_CROSS.at(model.type) +
                scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
@ -837,7 +899,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
    // always have at least one decoder

    wctx.model.buf = new std::vector<uint8_t>();
    wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type));
    wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type));

    // we skip initialization of the state until it is needed
    // because it might be that state will always be provided externally.
@ -928,6 +990,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
    size_t ctx_size = 0;

    const ggml_type wtype = wctx.wtype;
    const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type

    {
        const auto & hparams = model.hparams;

@ -946,92 +1009,92 @@

        // encoder
        {
            ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe;
            ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;

            ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w
            ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
            ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b
            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b

            ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w
            ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
            ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b
            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b

            ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w;
            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
            ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b;
            ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
        }

        // decoder
        {
            ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe;
            ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;

            ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te;
            ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;

            ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w;
            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
            ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b;
            ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
        }

        // encoder layers
        {
            ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
            ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b

            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w
            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
            ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
            ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b

            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w
            ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
            ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b

            ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
            ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
            ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b

            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w
            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
            ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b

            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w
            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w

            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w
            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
            ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b

            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w
            ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
            ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
            ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
        }

        // decoder layers
        {
            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b

            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w
            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
            ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
            ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b

            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w
            ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
            ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b

            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b

            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w
            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
            ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b

            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w
            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w

            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w
            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
            ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b

            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w
            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
            ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
            //
            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w
            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
            ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b
            ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b

            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w
            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
            ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b
            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b

            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w
            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w

            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w
            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
            ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b
            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b

            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w
            ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
            ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
            ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
        }

        ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
|
|||||||
{
|
{
|
||||||
model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
|
model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
|
||||||
|
|
||||||
model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state);
|
model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
|
||||||
model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
|
model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
|
||||||
|
|
||||||
model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state);
|
model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
|
||||||
model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
|
model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
|
||||||
|
|
||||||
model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
|
||||||
@ -1259,11 +1322,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
        while (true) {
            int32_t n_dims;
            int32_t length;
            int32_t ftype;
            int32_t ttype;

            read_safe(loader, n_dims);
            read_safe(loader, length);
            read_safe(loader, ftype);
            read_safe(loader, ttype);

            if (loader->eof(loader->context)) {
                break;

@ -1298,9 +1361,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                return false;
            }

            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
            const size_t bpe = ggml_type_size(ggml_type(ttype));

            if (nelements*bpe != ggml_nbytes(tensor)) {
            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                return false;

@ -1309,7 +1372,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
            loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
            BYTESWAP_TENSOR(tensor);

            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
            //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
            total_size += ggml_nbytes(tensor);
            model.n_loaded++;
        }
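The size check now has to divide by the block size: ggml_type_size() returns bytes per block, so the expected on-disk size becomes nelements * type_size / block_size, which degenerates to the old nelements * bpe for F16/F32 (block size 1). A standalone version of that arithmetic with the assumed q5_0 block layout:

    #include <cstdio>

    int main() {
        const size_t nelements  = 384 * 1536; // e.g. one MLP weight matrix of the tiny model
        const size_t type_size  = 22;         // bytes per q5_0 block (assumed: fp16 d + 4 qh + 16 qs)
        const size_t block_size = 32;         // weights per q5_0 block

        const size_t expected_bytes = nelements * type_size / block_size;
        std::printf("expected %zu bytes for %zu q5_0 weights\n", expected_bytes, nelements); // 405504
        // for F32 the same formula reduces to nelements * 4 / 1, exactly as before
        return 0;
    }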
@ -1508,14 +1571,14 @@ static bool whisper_encode_internal(
                ggml_permute(ctx0,
                        ggml_cpy(ctx0,
                            Qcur,
                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                        0, 2, 1, 3);

            struct ggml_tensor * K =
                ggml_permute(ctx0,
                        ggml_cpy(ctx0,
                            Kcur,
                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                        0, 2, 1, 3);

            struct ggml_tensor * V =

@ -1525,7 +1588,7 @@ static bool whisper_encode_internal(
                            Vcur,
                            n_state/n_head, n_head, n_ctx),
                        1, 2, 0, 3),
                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
                    ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));

            struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
#else

@ -1540,7 +1603,7 @@ static bool whisper_encode_internal(
                ggml_permute(ctx0,
                        ggml_cpy(ctx0,
                            Kcur,
                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
                            ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
                        0, 2, 1, 3);

            // K * Q

@ -1561,7 +1624,7 @@ static bool whisper_encode_internal(
                            Vcur,
                            n_state/n_head, n_head, n_ctx),
                        1, 2, 0, 3),
                    ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
                    ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
                );

            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

@ -1619,7 +1682,7 @@ static bool whisper_encode_internal(
            wstate.use_buf(ctx0, 0);

            cur = ggml_flash_ff(ctx0,
                    ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
                    ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
                    layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
#else
            wstate.use_buf(ctx0, 0);
@ -2537,9 +2600,9 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
struct whisper_state * whisper_init_state(whisper_context * ctx) {
    whisper_state * state = new whisper_state;

    const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
    const size_t scale = ctx->model.hparams.ftype ? 1 : 2;

    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
        fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
        delete state;
        return nullptr;

@ -2550,7 +2613,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
        fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
    }

    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) {
    if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
        delete state;
        return nullptr;
@ -3049,8 +3112,8 @@ int whisper_model_n_mels(struct whisper_context * ctx) {
    return ctx->model.hparams.n_mels;
}

int whisper_model_f16(struct whisper_context * ctx) {
int whisper_model_ftype(struct whisper_context * ctx) {
    return ctx->model.hparams.f16;
    return ctx->model.hparams.ftype;
}

int whisper_model_type(struct whisper_context * ctx) {
@ -4825,23 +4888,32 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
    // when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
    std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);

    // put a bunch of random data in the buffer
    for (size_t i = 0; i < buf.size(); i++) buf[i] = i;

    for (int j = 0; j < (int) sizes.size(); j++) {
        int n_q4_0 = 0;
        int n_q4_1 = 0;
        int n_fp16 = 0;
        int n_fp32 = 0;

        // GFLOPS/s
        double s_q4_0 = 0.0;
        double s_q4_1 = 0.0;
        double s_fp16 = 0.0;
        double s_fp32 = 0.0;

        const size_t N = sizes[j];

        for (int k = 0; k < 2; ++k) {
        for (int k = 0; k < 4; ++k) {
            const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
            const ggml_type wtype =
                k == 0 ? GGML_TYPE_Q4_0 :
                k == 1 ? GGML_TYPE_Q4_1 :
                k == 2 ? GGML_TYPE_F16  :
                         GGML_TYPE_F32;

            double & s = k == 0 ? s_fp16 : s_fp32;
            double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
            int & n = k == 0 ? n_fp16 : n_fp32;
            int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;

            struct ggml_init_params gparams = {
                /*.mem_size =*/ buf.size(),

@ -4885,8 +4957,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
            s = ((2.0*N*N*N*n)/tsum)*1e-9;
        }

        snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
        snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n",
                N, N, s_fp16, n_fp16, s_fp32, n_fp32);
                N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32);
        s += strbuf;
    }
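The benchmark now times Q4_0 and Q4_1 matrix multiplications alongside F16 and F32. The reported figure is the usual 2*N^3 floating-point operations per multiplication, scaled by the number of completed runs and the elapsed time; a standalone version of that calculation with made-up example numbers:

    #include <cstdio>

    int main() {
        const double N    = 512.0; // square matrix dimension
        const double runs = 16.0;  // completed multiplications
        const double tsum = 0.25;  // total elapsed seconds (example value)

        const double gflops = (2.0*N*N*N*runs/tsum)*1e-9;
        std::printf("%.1f GFLOPS\n", gflops); // ~17.2 GFLOPS for these example numbers
        return 0;
    }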
@ -258,7 +258,7 @@ extern "C" {
    WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
    WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
    WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
    WHISPER_API int whisper_model_type (struct whisper_context * ctx);

    // Token logits obtained from the last call to whisper_decode()
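Bindings that previously called whisper_model_f16() should switch to whisper_model_ftype(), which returns the raw ftype value read from the model header. A minimal usage sketch, assuming the usual whisper_init_from_file() entry point and a hypothetical quantized model path:

    #include <cstdio>
    #include "whisper.h"

    int main() {
        struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en-q5_1.bin");
        if (ctx == nullptr) {
            return 1;
        }

        // 0 = F32, 1 = F16, larger values = quantized formats (see enum ggml_ftype)
        std::printf("model ftype: %d\n", whisper_model_ftype(ctx));

        whisper_free(ctx);
        return 0;
    }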