whisper : add integer quantization support (#540)

* whisper : add integer quantization support
* examples : add common-ggml + prepare to add "quantize" tool
* whisper : quantization tool ready
* whisper : fix F32 support
* whisper : try to fix shared lib linkage
* wasm : update quantized models to Q5
* bench.wasm : remove "medium" button
* bench.wasm : fix custom model button
* ggml : add Q5_0 and Q5_1 WASM SIMD
* wasm : add quantized models to all WASM examples
* wasm : bump DB version number to 2
* talk-llama : update example to latest llama.cpp
* node : increase test timeout to 10s
* readme : add information for model quantization
* wasm : add links to other examples
2025-08-18 09:20:48 +02:00 · 2023-04-30 18:51:57 +03:00
parent 5fd1bdd7fc
commit 794b162a46
41 changed files with 3183 additions and 1010 deletions
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -21,10 +21,14 @@ set(TARGET common)
 add_library(${TARGET} STATIC
    common.h
    common.cpp
+    common-ggml.h
+    common-ggml.cpp
    )

 include(DefaultTargetOptions)

+target_link_libraries(${TARGET} PRIVATE whisper)
+
 set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)

 if (WHISPER_SDL2)
@@ -62,6 +66,7 @@ else()
    add_subdirectory(stream)
    add_subdirectory(command)
    add_subdirectory(bench)
+    add_subdirectory(quantize)
    add_subdirectory(talk)
    add_subdirectory(talk-llama)
 endif()
--- a/examples/addon.node/test/whisper.spec.js
+++ b/examples/addon.node/test/whisper.spec.js
@@ -14,9 +14,10 @@ const whisperParamsMock = {
 };

 describe("Run whisper.node", () => {
-  test("it should receive a non-empty value", async () => {
-    let result = await whisperAsync(whisperParamsMock);
+    test("it should receive a non-empty value", async () => {
+        let result = await whisperAsync(whisperParamsMock);

-    expect(result.length).toBeGreaterThan(0);
-  });
+        expect(result.length).toBeGreaterThan(0);
+    }, 10000);
 });
+
--- a/examples/bench.wasm/CMakeLists.txt
+++ b/examples/bench.wasm/CMakeLists.txt
@@ -31,9 +31,9 @@ endif()
 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1024MB \
-    -s TOTAL_MEMORY=1024MB \
+    -s PTHREAD_POOL_SIZE_STRICT=0 \
+    -s INITIAL_MEMORY=2000MB \
+    -s TOTAL_MEMORY=2000MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/bench.wasm/index-tmpl.html
+++ b/examples/bench.wasm/index-tmpl.html
@@ -35,6 +35,15 @@

            <br><br>

+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
            <hr>

            Select the model you would like to use and click the "Bench" button.<br>
@@ -44,11 +53,18 @@

            <div id="model-whisper">
                Whisper model: <span id="model-whisper-status"></span>
-                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
-                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
-                <span id="fetch-whisper-progress"></span>
-
+                <button id="fetch-whisper-tiny-en"  onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
+                <button id="fetch-whisper-base-en"  onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
+                <button id="fetch-whisper-small-en-q5_1"  onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
+                <button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
+                <button id="fetch-whisper-large-q5_0"     onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
+                <span id="fetch-whisper-progress"></span>
            </div>

            <br>
@@ -160,6 +176,14 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-small-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
                document.getElementById('whisper-file'         ).style.display = 'none';
                document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
            }
@@ -168,19 +192,42 @@
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+                    'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
+                    'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
+                    'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
+                    'large-q5_0':    'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
+                    'small.en': 466,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
+                    'small-en-q5_1':  182,
+                    'medium-en-q5_0': 515,
+                    'large-q5_0':     1030,
                };

                let url     = urls[model];
                let dst     = 'whisper.bin';
                let size_mb = sizes[model];

-                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
-                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+                document.getElementById('fetch-whisper-tiny-en').style.display  = 'none';
+                document.getElementById('fetch-whisper-base-en').style.display  = 'none';
+                document.getElementById('fetch-whisper-small-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
+                document.getElementById('whisper-file'        ).style.display = 'none';
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

                cbProgress = function(p) {
@@ -190,9 +237,18 @@

                cbCancel = function() {
                    var el;
-                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
+                    el = document.getElementById('fetch-whisper-tiny-en');  if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en');  if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-large-q5_0'    ); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('whisper-file'        ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
                };

                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
--- a/examples/command.wasm/index-tmpl.html
+++ b/examples/command.wasm/index-tmpl.html
@@ -35,6 +35,15 @@

            <br><br>

+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
            <hr>

            Select the model you would like to use, click the "Start" button and follow the instructions.
@@ -45,6 +54,10 @@
                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
                <span id="fetch-whisper-progress"></span>

                <!--
@@ -162,11 +175,17 @@
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
                };

                let url     = urls[model];
@@ -177,6 +196,10 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

                cbProgress = function(p) {
@@ -188,6 +211,10 @@
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                };

--- a/examples/common-ggml.cpp
+++ b/examples/common-ggml.cpp
@@ -0,0 +1,241 @@
+#include "common-ggml.h"
+
+#include <regex>
+#include <map>
+
+// Lookup table from a textual quantization name (e.g. "q5_1") to the matching
+// ggml_ftype value. It is iterated by ggml_print_ftypes() and searched by
+// ggml_parse_ftype() for strings that start with 'q'.
+static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
+    {"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
+    {"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
+    {"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
+    {"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
+    {"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
+    {"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
+};
+
+// Print every entry of GGML_FTYPE_MAP to `fp`, one per line, showing both the
+// textual name and the numeric ftype value that ggml_parse_ftype() accepts.
+void ggml_print_ftypes(FILE * fp) {
+    for (const auto & entry : GGML_FTYPE_MAP) {
+        const char * name  = entry.first.c_str();
+        const int    value = entry.second;
+
+        fprintf(fp, "  type = \"%s\" or %d\n", name, value);
+    }
+}
+
+// Convert a string into a ggml_ftype.
+//
+// Strings starting with 'q' are looked up by name in GGML_FTYPE_MAP; an
+// unknown name is reported on stderr and yields GGML_FTYPE_UNKNOWN.
+// Any other string is converted with atoi() and cast to the enum as-is
+// (no range check is performed on the numeric value).
+enum ggml_ftype ggml_parse_ftype(const char * str) {
+    // numeric form: take the integer value verbatim
+    if (str[0] != 'q') {
+        return (enum ggml_ftype) atoi(str);
+    }
+
+    // symbolic form: must be one of the known names
+    const auto found = GGML_FTYPE_MAP.find(str);
+    if (found != GGML_FTYPE_MAP.end()) {
+        return found->second;
+    }
+
+    fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
+    return GGML_FTYPE_UNKNOWN;
+}
+
+// Quantize the tensor records of a ggml model stream.
+//
+// Reads tensor records from `finp` until end-of-file and writes each of them
+// to `fout`. A record consists of: n_dims, name length, tensor type, the
+// dimensions, the name and the raw tensor data.
+//
+// A tensor is quantized to the type implied by `ftype` when
+//   - its name fully matches at least one regex in `to_quant`,
+//   - its name matches no regex in `to_skip`, and
+//   - it has exactly 2 dimensions.
+// All other tensors are copied through unchanged.
+//
+// Progress, per-tensor sizes and quantization histograms are printed to
+// stdout; errors go to stderr.
+//
+// Returns false when `ftype` does not map to a quantized type, when a tensor
+// selected for quantization is neither F32 nor F16, or when a record is
+// malformed / truncated. Returns true otherwise.
+//
+// Note: std::regex construction throws std::regex_error for an invalid
+// pattern in `to_quant` / `to_skip`.
+bool ggml_common_quantize_0(
+        std::ifstream & finp,
+        std::ofstream & fout,
+        const ggml_ftype ftype,
+        const std::vector<std::string> & to_quant,
+        const std::vector<std::string> & to_skip) {
+
+    ggml_type qtype = GGML_TYPE_F32;
+
+    switch (ftype) {
+        case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
+        case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
+        case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
+        case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
+        case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
+        case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
+        case GGML_FTYPE_UNKNOWN:
+        case GGML_FTYPE_ALL_F32:
+        case GGML_FTYPE_MOSTLY_F16:
+        case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
+                {
+                    fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
+                    return false;
+                }
+    };
+
+    if (!ggml_is_quantized(qtype)) {
+        fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
+        return false;
+    }
+
+    // compile the name patterns once instead of once per tensor
+    std::vector<std::regex> re_quant;
+    re_quant.reserve(to_quant.size());
+    for (const auto & s : to_quant) {
+        re_quant.emplace_back(s);
+    }
+
+    std::vector<std::regex> re_skip;
+    re_skip.reserve(to_skip.size());
+    for (const auto & s : to_skip) {
+        re_skip.emplace_back(s);
+    }
+
+    // upper bound on the number of dimensions a record may declare; the shape
+    // buffer below is sized accordingly so that tensors with more than 2
+    // dimensions no longer write past the end of it
+    // NOTE(review): 4 is assumed to match ggml's maximum tensor rank - confirm
+    constexpr int32_t k_max_dims = 4;
+
+    size_t total_size_org = 0;
+    size_t total_size_new = 0;
+
+    std::vector<float> work;
+
+    std::vector<uint8_t>     data_u8;
+    std::vector<ggml_fp16_t> data_f16;
+    std::vector<float>       data_f32;
+
+    std::vector<int64_t> hist_all(1 << 4, 0);
+
+    while (true) {
+        int32_t n_dims;
+        int32_t length;
+        int32_t ttype;
+
+        finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        finp.read(reinterpret_cast<char *>(&length), sizeof(length));
+        finp.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+
+        if (finp.eof()) {
+            break;
+        }
+
+        if (n_dims < 0 || n_dims > k_max_dims) {
+            fprintf(stderr, "%s: invalid number of dimensions %d\n", __func__, n_dims);
+            return false;
+        }
+
+        if (length < 0) {
+            fprintf(stderr, "%s: invalid tensor name length %d\n", __func__, length);
+            return false;
+        }
+
+        int32_t nelements = 1;
+        int32_t ne[k_max_dims] = { 1, 1, 1, 1 };
+        for (int i = 0; i < n_dims; ++i) {
+            finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+            nelements *= ne[i];
+        }
+
+        std::string name(length, 0);
+        finp.read (&name[0], length);
+
+        if (!finp) {
+            fprintf(stderr, "%s: unexpected end of input while reading a tensor header\n", __func__);
+            return false;
+        }
+
+        printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype));
+
+        bool quantize = false;
+
+        // check if we should quantize this tensor
+        for (const auto & re : re_quant) {
+            if (std::regex_match(name, re)) {
+                quantize = true;
+                break;
+            }
+        }
+
+        // check if we should skip this tensor
+        for (const auto & re : re_skip) {
+            if (std::regex_match(name, re)) {
+                quantize = false;
+                break;
+            }
+        }
+
+        // quantize only 2D tensors
+        quantize &= (n_dims == 2);
+
+        if (quantize) {
+            if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
+                fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+                return false;
+            }
+
+            if (ttype == GGML_TYPE_F16) {
+                // F16 input is expanded to F32 before quantization
+                data_f16.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+                data_f32.resize(nelements);
+                for (int i = 0; i < nelements; ++i) {
+                    data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+                }
+            } else {
+                data_f32.resize(nelements);
+                finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+            }
+
+            // the output record advertises the quantized type
+            ttype = qtype;
+        } else {
+            // pass-through: type 0 is stored with 4 bytes per element, everything else with 2
+            // NOTE(review): this presumes non-quantized inputs are only F32 or F16 - confirm
+            const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+            data_u8.resize(nelements*bpe);
+            finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
+        }
+
+        if (!finp) {
+            fprintf(stderr, "%s: unexpected end of input while reading data of tensor '%s'\n", __func__, name.c_str());
+            return false;
+        }
+
+        fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+        fout.write(reinterpret_cast<char *>(&length), sizeof(length));
+        fout.write(reinterpret_cast<char *>(&ttype),  sizeof(ttype));
+        for (int i = 0; i < n_dims; ++i) {
+            fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+        }
+        fout.write(&name[0], length);
+
+        if (quantize) {
+            work.resize(nelements); // for quantization
+
+            size_t cur_size = 0;
+            std::vector<int64_t> hist_cur(1 << 4, 0);
+
+            switch ((ggml_type) ttype) {
+                case GGML_TYPE_Q4_0:
+                    {
+                        cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q4_1:
+                    {
+                        cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q4_2:
+                    {
+                        cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q5_0:
+                    {
+                        cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q5_1:
+                    {
+                        cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_Q8_0:
+                    {
+                        cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                    } break;
+                case GGML_TYPE_F32:
+                case GGML_TYPE_F16:
+                case GGML_TYPE_I8:
+                case GGML_TYPE_I16:
+                case GGML_TYPE_I32:
+                case GGML_TYPE_Q8_1:
+                case GGML_TYPE_COUNT:
+                    {
+                        fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+                        return false;
+                    }
+            }
+
+            fout.write(reinterpret_cast<char *>(work.data()), cur_size);
+            total_size_new += cur_size;
+
+            printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+            for (size_t i = 0; i < hist_cur.size(); ++i) {
+                hist_all[i] += hist_cur[i];
+            }
+
+            for (size_t i = 0; i < hist_cur.size(); ++i) {
+                printf("%5.3f ", hist_cur[i] / (float)nelements);
+            }
+            printf("\n");
+        } else {
+            printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+            fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+            total_size_new += data_u8.size();
+        }
+
+        // the "original" size is always accounted as 4 bytes per element
+        total_size_org += nelements * sizeof(float);
+    }
+
+    printf("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+    printf("%s: quant size  = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
+
+    {
+        int64_t sum_all = 0;
+        for (size_t i = 0; i < hist_all.size(); ++i) {
+            sum_all += hist_all[i];
+        }
+
+        printf("%s: hist: ", __func__);
+        for (size_t i = 0; i < hist_all.size(); ++i) {
+            printf("%5.3f ", hist_all[i] / (float)sum_all);
+        }
+        printf("\n");
+    }
+
+    return true;
+}
--- a/examples/common-ggml.h
+++ b/examples/common-ggml.h
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "ggml.h"
+
+#include <fstream>
+#include <vector>
+#include <string>
+
+// Parse a ggml_ftype from a string: either a name such as "q5_1" or a
+// numeric value. Returns GGML_FTYPE_UNKNOWN for an unrecognized 'q...' name.
+enum ggml_ftype ggml_parse_ftype(const char * str);
+
+// Print the list of supported quantization names and their numeric values
+// to `fp` (stderr by default).
+void ggml_print_ftypes(FILE * fp = stderr);
+
+// Read tensor records from `finp` until end-of-file and write them to `fout`,
+// quantizing to the type implied by `ftype` those 2D tensors whose name
+// matches a regex in `to_quant` and no regex in `to_skip`; other tensors are
+// copied unchanged. Returns false on an unsupported `ftype` or tensor type.
+bool ggml_common_quantize_0(
+        std::ifstream & finp,
+        std::ofstream & fout,
+        const ggml_ftype ftype,
+        const std::vector<std::string> & to_quant,
+        const std::vector<std::string> & to_skip);
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -6,13 +6,86 @@
 #include "dr_wav.h"

 #include <cmath>
-#include <cstdint>
+#include <fstream>
 #include <regex>

 #ifndef M_PI
 #define M_PI 3.14159265358979323846
 #endif

+// Parse the command line into `params`.
+//
+// Recognized options are listed by gpt_print_usage(). "-h" / "--help" prints
+// the usage and exits with status 0; an unknown argument prints an error plus
+// the usage and also exits with status 0 (kept as in the original behavior).
+// An option that requires a value but is the last argument prints an error
+// plus the usage and exits with status 1 instead of reading past `argv`.
+//
+// Numeric values are converted with std::stoi / std::stof, which throw
+// std::invalid_argument / std::out_of_range on malformed input.
+//
+// Returns true when all arguments were consumed.
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
+    for (int i = 1; i < argc; i++) {
+        const std::string arg = argv[i];
+
+        // fetch the value that follows the current option, making sure it exists
+        const auto next_value = [&]() -> const char * {
+            if (i + 1 >= argc) {
+                fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+                gpt_print_usage(argc, argv, params);
+                exit(1);
+            }
+            return argv[++i];
+        };
+
+        if (arg == "-s" || arg == "--seed") {
+            params.seed = std::stoi(next_value());
+        } else if (arg == "-t" || arg == "--threads") {
+            params.n_threads = std::stoi(next_value());
+        } else if (arg == "-p" || arg == "--prompt") {
+            params.prompt = next_value();
+        } else if (arg == "-n" || arg == "--n_predict") {
+            params.n_predict = std::stoi(next_value());
+        } else if (arg == "--top_k") {
+            params.top_k = std::stoi(next_value());
+        } else if (arg == "--top_p") {
+            params.top_p = std::stof(next_value());
+        } else if (arg == "--temp") {
+            params.temp = std::stof(next_value());
+        } else if (arg == "-b" || arg == "--batch_size") {
+            params.n_batch = std::stoi(next_value());
+        } else if (arg == "-m" || arg == "--model") {
+            params.model = next_value();
+        } else if (arg == "-h" || arg == "--help") {
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        } else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            gpt_print_usage(argc, argv, params);
+            exit(0);
+        }
+    }
+
+    return true;
+}
+
+// Write the command-line help text to stderr. The defaults shown for the
+// threads, n_predict, top_k, top_p, temp, batch size and model options are
+// taken from `params`; the seed default is printed as a fixed "-1".
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
+    FILE * out = stderr;
+
+    fprintf(out, "usage: %s [options]\n", argv[0]);
+    fputs("\n", out);
+    fputs("options:\n", out);
+    fputs("  -h, --help            show this help message and exit\n", out);
+    fputs("  -s SEED, --seed SEED  RNG seed (default: -1)\n", out);
+    fprintf(out, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
+    fputs("  -p PROMPT, --prompt PROMPT\n", out);
+    fputs("                        prompt to start generation with (default: random)\n", out);
+    fprintf(out, "  -n N, --n_predict N   number of tokens to predict (default: %d)\n", params.n_predict);
+    fprintf(out, "  --top_k N             top-k sampling (default: %d)\n", params.top_k);
+    fprintf(out, "  --top_p N             top-p sampling (default: %.1f)\n", params.top_p);
+    fprintf(out, "  --temp N              temperature (default: %.1f)\n", params.temp);
+    fprintf(out, "  -b N, --batch_size N  batch size for prompt processing (default: %d)\n", params.n_batch);
+    fputs("  -m FNAME, --model FNAME\n", out);
+    fprintf(out, "                        model path (default: %s)\n", params.model.c_str());
+    fputs("\n", out);
+}
+
+// Pick one of ten fixed prompt openers, selected by the next value drawn
+// from `rng` modulo 10. Advances `rng` by exactly one draw.
+std::string gpt_random_prompt(std::mt19937 & rng) {
+    static const char * const k_prompts[10] = {
+        "So",
+        "Once upon a time",
+        "When",
+        "The",
+        "After",
+        "If",
+        "import",
+        "He",
+        "She",
+        "They",
+    };
+
+    // rng() is unsigned, so the index is always within [0, 9]
+    const int idx = rng() % 10;
+
+    return k_prompts[idx];
+}
+
 std::string trim(const std::string & s) {
    std::regex e("^\\s+|\\s+$");
    return std::regex_replace(s, e, "");
@@ -28,6 +101,251 @@ std::string replace(const std::string & s, const std::string & from, const std::
    return result;
 }

+// Minimal parser for a flat JSON object of the form { "key": <int>, ... }.
+//
+// Reads the whole file `fname` and returns a map from key to integer value.
+// Exits the process with status 1 when the file cannot be opened. Returns an
+// empty map when the content does not start with '{'.
+//
+// Keys have the escape sequences \u0120, \u010a and \" replaced by a space,
+// a newline and a double quote respectively. Entries whose value cannot be
+// converted by std::stoi are silently ignored.
+//
+// Parsing stops at the end of the input; truncated or malformed input never
+// causes reads beyond the end of the buffer.
+std::map<std::string, int32_t> json_parse(const std::string & fname) {
+    std::map<std::string, int32_t> result;
+
+    // read file into string
+    std::string json;
+    {
+        std::ifstream ifs(fname);
+        if (!ifs) {
+            fprintf(stderr, "Failed to open %s\n", fname.c_str());
+            exit(1);
+        }
+
+        json = std::string((std::istreambuf_iterator<char>(ifs)),
+                (std::istreambuf_iterator<char>()));
+    }
+
+    if (json.empty() || json[0] != '{') {
+        return result;
+    }
+
+    // parse json
+    {
+        bool has_key  = false; // true while collecting the value of a key
+        bool in_token = false; // true while inside a quoted string
+
+        std::string str_key = "";
+        std::string str_val = "";
+
+        const int n = json.size();
+        for (int i = 1; i < n; ++i) {
+            if (!in_token) {
+                if (json[i] == ' ') continue;
+                if (json[i] == '"') {
+                    in_token = true;
+                    continue;
+                }
+            } else {
+                if (json[i] == '\\' && i+1 < n) {
+                    // keep the backslash; the escaped character is appended below
+                    if (has_key == false) {
+                        str_key += json[i];
+                    } else {
+                        str_val += json[i];
+                    }
+                    ++i;
+                } else if (json[i] == '"') {
+                    if (has_key == false) {
+                        // end of the key: skip over `:` and surrounding spaces
+                        has_key = true;
+                        ++i;
+                        while (i < n && json[i] == ' ') ++i;
+                        ++i; // :
+                        while (i < n && json[i] == ' ') ++i;
+                        if (i >= n) {
+                            // input ended before a value was found
+                            break;
+                        }
+                        if (json[i] != '\"') {
+                            // unquoted value: read up to the next ',' or '}'
+                            while (i < n && json[i] != ',' && json[i] != '}') {
+                                str_val += json[i++];
+                            }
+                            has_key = false;
+                        } else {
+                            // quoted value: keep collecting it as a token
+                            in_token = true;
+                            continue;
+                        }
+                    } else {
+                        has_key = false;
+                    }
+
+                    str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
+                    str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
+                    str_key = ::replace(str_key, "\\\"",    "\""); // \\\"   -> "
+
+                    try {
+                        result[str_key] = std::stoi(str_val);
+                    } catch (...) {
+                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
+
+                    }
+                    str_key = "";
+                    str_val = "";
+                    in_token = false;
+                    continue;
+                }
+                if (has_key == false) {
+                    str_key += json[i];
+                } else {
+                    str_val += json[i];
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+// Split `text` into token ids using `vocab`.
+//
+// The text is first cut into words with the GPT-2 style regex below. Each
+// word is then covered greedily from left to right: at every position the
+// longest substring present in `vocab.token_to_id` is emitted. When not even
+// the single character at the current position is in the vocabulary, it is
+// reported on stderr as an unknown token and skipped.
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+    std::vector<std::string> words;
+
+    // first split the text into words
+    {
+        std::string str = text;
+        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
+
+        std::regex re(pat);
+        std::smatch m;
+
+        while (std::regex_search(str, m, re)) {
+            for (auto x : m) {
+                words.push_back(x);
+            }
+            str = m.suffix();
+        }
+    }
+
+    // find the longest tokens that form the words:
+    std::vector<gpt_vocab::id> tokens;
+    for (const auto & word : words) {
+        if (word.size() == 0) continue;
+
+        const int n = word.size();
+
+        int i = 0;
+        while (i < n) {
+            bool matched = false;
+
+            // try the longest candidate first, shrinking down to one character
+            for (int j = n; j > i; --j) {
+                const auto it = vocab.token_to_id.find(word.substr(i, j-i));
+                if (it != vocab.token_to_id.end()) {
+                    tokens.push_back(it->second);
+                    i = j;
+                    matched = true;
+                    break;
+                }
+            }
+
+            // only fall back to "unknown" when nothing matched at this position;
+            // previously this branch also ran right after a successful match and
+            // consumed the following character on its own
+            if (!matched) {
+                const auto sub = word.substr(i, 1);
+                fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
+                ++i;
+            }
+        }
+    }
+
+    return tokens;
+}
+
+// Populate `vocab` from the JSON file `fname`: token_to_id is replaced by the
+// parsed mapping and every (token, id) pair is mirrored into id_to_token.
+// Progress is logged to stdout. Always returns true.
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
+    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
+
+    vocab.token_to_id = ::json_parse(fname);
+
+    // build the reverse lookup
+    for (auto it = vocab.token_to_id.begin(); it != vocab.token_to_id.end(); ++it) {
+        vocab.id_to_token[it->second] = it->first;
+    }
+
+    const int n_vocab = (int) vocab.token_to_id.size();
+    printf("%s: vocab size = %d\n", __func__, n_vocab);
+
+    return true;
+}
+
+// Sample a token id from `logits` using temperature, top-k and top-p.
+//
+//   vocab  - its id_to_token size determines how many logits are read
+//   logits - array with at least that many entries
+//   top_k  - number of highest-scoring candidates to keep; clamped to
+//            [1, number of logits]
+//   top_p  - cumulative probability cut-off applied to the kept candidates
+//            (no cut-off when >= 1.0)
+//   temp   - temperature; the logits are multiplied by 1/temp
+//   rng    - random engine used for the final draw
+//
+// Returns the sampled token id.
+// NOTE(review): an empty vocabulary and temp == 0 are not handled - callers
+// are presumably expected to avoid them; confirm.
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng) {
+    int n_logits = vocab.id_to_token.size();
+
+    // keep top_k within the valid range: a larger value would hand
+    // std::partial_sort an iterator past the end, and a non-positive value
+    // would leave no candidate to sample from
+    top_k = std::min(std::max(top_k, 1), n_logits);
+
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const double scale = 1.0/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            logits_id.push_back(std::make_pair(logits[i]*scale, i));
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    // largest scaled logit, subtracted below for numerical stability of exp()
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        // keep the smallest prefix whose cumulative probability reaches top_p
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        // renormalize the remaining candidates
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+    //printf("\n");
+    //for (int i = 0; i < (int) probs.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //}
+    //exit(0);
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
+
 bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
    drwav wav;
    std::vector<uint8_t> wav_data; // used for pipe input from stdin
--- a/examples/common.h
+++ b/examples/common.h
@@ -1,10 +1,44 @@
+// Various helper functions and utilities
+
 #pragma once

-// needs to match WHISPER_SAMPLE_RATE
+#include <algorithm>
+#include <cstdint>
+#include <map>
+#include <random>
+#include <string>
+#include <thread>
+#include <vector>
+
 #define COMMON_SAMPLE_RATE 16000

-#include <vector>
-#include <string>
+//
+// CLI argument parsing
+//
+
+// Options filled in from the command line by gpt_params_parse()
+struct gpt_params {
+    int32_t seed      = -1; // RNG seed
+    // worker thread count: the reported hardware concurrency capped at 4, but never below 1
+    // (std::thread::hardware_concurrency() returns 0 when the value cannot be determined)
+    int32_t n_threads = std::max(1, std::min(4, (int32_t) std::thread::hardware_concurrency()));
+    int32_t n_predict = 200; // new tokens to predict
+
+    // sampling parameters
+    int32_t top_k = 40;   // keep only the K most likely tokens
+    float   top_p = 0.9f; // cumulative probability cut-off applied after top_k
+    float   temp  = 0.9f; // sampling temperature
+
+    int32_t n_batch = 8; // batch size for prompt processing
+
+    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
+    std::string prompt; // empty by default
+};
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//

 std::string trim(const std::string & s);

@@ -13,6 +47,52 @@ std::string replace(
        const std::string & from,
        const std::string & to);

+// Vocabulary lookup tables: one map per direction (text -> id and id -> text).
+// The struct does not keep the two maps in sync; whoever fills it must populate both.
+struct gpt_vocab {
+    using id    = int32_t;     // numeric token id
+    using token = std::string; // token text
+
+    std::map<token, id> token_to_id; // token text -> id
+    std::map<id, token> id_to_token; // id -> token text
+};
+
+// poor-man's JSON parsing
+std::map<std::string, int32_t> json_parse(const std::string & fname);
+
+// split text into tokens
+//
+// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+//
+// Regex (Python):
+// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+//
+// Regex (C++):
+// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+//
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+
+// load the tokens from encoder.json
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+
+// sample the next token given the logits of all vocab entries
+//
+//   - consider only the top K tokens (highest logits)
+//   - if top_p < 1.0, keep only the leading tokens up to and including the one at which
+//     the cumulative probability reaches P (>= P), then renormalize
+//   - draw one of the remaining tokens at random, weighted by probability, using rng
+//
+// NOTE(review): top_k is used as an offset/size for the candidate list, so it has to be
+//               at least 1 and no larger than the number of logits - confirm that all
+//               callers guarantee this
+//
+// TODO: not sure if this implementation is correct
+// TODO: temperature is not implemented
+// NOTE(review): the implementation multiplies every logit by a `scale` factor - if that
+//               factor is derived from temp, the TODO above is stale; verify in common.cpp
+//
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng);
+
+//
+// Audio utils
+//
+
 // Read WAV audio file and store the PCM data into pcmf32
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
--- a/examples/helpers.js
+++ b/examples/helpers.js
@@ -145,7 +145,15 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
                            var db = event.target.result;
                            var tx = db.transaction(['models'], 'readwrite');
                            var os = tx.objectStore('models');
-                            var rq = os.put(data, url);
+
+                            var rq = null;
+                            try {
+                                var rq = os.put(data, url);
+                            } catch (e) {
+                                cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
+                                cbCancel();
+                                return;
+                            }

                            rq.onsuccess = function (event) {
                                cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
@@ -180,7 +188,6 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {

    rq.onabort = function (event) {
        cbPrint('loadRemote: failed to open IndexedDB: abort');
-
+        cbCancel();
    };
 }
-
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -496,7 +496,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
                value_i("layer", whisper_model_n_text_layer(ctx), true);
            end_obj();
            value_i("mels", whisper_model_n_mels(ctx));
-            value_i("f16", whisper_model_f16(ctx), true);
+            value_i("ftype", whisper_model_ftype(ctx), true);
        end_obj();
        start_obj("params");
            value_s("model", params.model.c_str());
--- a/examples/quantize/CMakeLists.txt
+++ b/examples/quantize/CMakeLists.txt
@@ -0,0 +1,6 @@
+# quantize: command-line tool for integer quantization of Whisper ggml model files
+set(TARGET quantize)
+
+add_executable(${TARGET}
+    quantize.cpp
+    )
+
+# project-wide target defaults (presumably keyed on ${TARGET} - see cmake/DefaultTargetOptions)
+include(DefaultTargetOptions)
+
+target_link_libraries(${TARGET}
+    PRIVATE
+        common
+        whisper
+        ${CMAKE_THREAD_LIBS_INIT}
+    )
--- a/examples/quantize/README.md
+++ b/examples/quantize/README.md
@@ -0,0 +1,3 @@
+# quantize
+
+Tool for integer quantization of Whisper `ggml` model files
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -0,0 +1,215 @@
+#include "ggml.h"
+
+#include "common.h"
+#include "common-ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <regex>
+
+// Whisper hyper-parameters, declared in the same order in which whisper_model_quantize()
+// reads them from the model file header.
+// default hparams (Whisper tiny)
+struct whisper_hparams {
+    int32_t n_vocab       = 51864;
+    int32_t n_audio_ctx   = 1500;
+    int32_t n_audio_state = 384;
+    int32_t n_audio_head  = 6;
+    int32_t n_audio_layer = 4;
+    int32_t n_text_ctx    = 448;
+    int32_t n_text_state  = 384;
+    int32_t n_text_head   = 6;
+    int32_t n_text_layer  = 4;
+    int32_t n_mels        = 80;
+    int32_t f16           = 1; // last header field; the quantized output stores the target ftype in this slot
+};
+
+// Mel filter bank as stored in the model file: two dimensions followed by
+// n_mel * n_fft float coefficients.
+struct whisper_filters {
+    // zero-initialized so that a failed or short read leaves a well-defined, empty bank
+    // instead of indeterminate dimensions
+    int32_t n_mel = 0;
+    int32_t n_fft = 0;
+
+    std::vector<float> data; // n_mel * n_fft coefficients
+};
+
+// Quantize a Whisper ggml model file.
+//
+// The header of fname_inp (magic, hparams, mel filters, vocab) is copied to fname_out as-is,
+// except for the last hparams field, which is replaced by the requested ftype. The rest of
+// the input is then handed to ggml_common_quantize_0() together with the output stream.
+//
+//   fname_inp - model file to read
+//   fname_out - quantized model file to write (created or truncated)
+//   ftype     - target file type (main() lists the known values via ggml_print_ftypes())
+//
+// Returns true on success. On failure a message is printed to stderr and false is returned;
+// fname_out may then exist but be incomplete.
+bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
+    printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
+
+    // opening the output truncates it, so writing onto the input would destroy the model
+    // before it is read (this only catches identical path strings, not aliases of one file)
+    if (fname_inp == fname_out) {
+        fprintf(stderr, "%s: input and output file must differ ('%s')\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    std::ifstream finp(fname_inp, std::ios::binary);
+    if (!finp) {
+        fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    std::ofstream fout(fname_out, std::ios::binary);
+    if (!fout) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
+        return false;
+    }
+
+    // true while finp is healthy; otherwise reports which header section could not be read
+    const auto read_ok = [&finp, &fname_inp](const char * what) -> bool {
+        if (finp) {
+            return true;
+        }
+        fprintf(stderr, "whisper_model_quantize: failed to read %s from '%s' (truncated file?)\n", what, fname_inp.c_str());
+        return false;
+    };
+
+    // verify magic
+    {
+        // stays 0 when the read fails, which is then rejected as a bad magic below
+        uint32_t magic = 0;
+        finp.read((char *) &magic, sizeof(magic));
+        if (magic != 0x67676d6c) { // "ggml" as ASCII bytes
+            fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
+            return false;
+        }
+
+        fout.write((const char *) &magic, sizeof(magic));
+    }
+
+    whisper_hparams hparams;
+
+    // load hparams
+    {
+        finp.read((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
+        finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
+        finp.read((char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
+        finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
+        finp.read((char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
+        finp.read((char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
+        finp.read((char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
+        finp.read((char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
+        finp.read((char *) &hparams.n_mels,        sizeof(hparams.n_mels));
+        finp.read((char *) &hparams.f16,           sizeof(hparams.f16));
+
+        if (!read_ok("hparams")) {
+            return false;
+        }
+
+        fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
+        fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
+        fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
+        fprintf(stderr, "%s: n_audio_head  = %d\n", __func__, hparams.n_audio_head);
+        fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
+        fprintf(stderr, "%s: n_text_ctx    = %d\n", __func__, hparams.n_text_ctx);
+        fprintf(stderr, "%s: n_text_state  = %d\n", __func__, hparams.n_text_state);
+        fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
+        fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
+        fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
+        fprintf(stderr, "%s: f16           = %d\n", __func__, hparams.f16);
+
+        // the header slot is a 32-bit integer: convert explicitly so that the bytes written
+        // do not depend on the in-memory size of the ggml_ftype enum
+        const int32_t ftype_out = (int32_t) ftype;
+
+        fout.write((const char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
+        fout.write((const char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
+        fout.write((const char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
+        fout.write((const char *) &hparams.n_audio_head,  sizeof(hparams.n_audio_head));
+        fout.write((const char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
+        fout.write((const char *) &hparams.n_text_ctx,    sizeof(hparams.n_text_ctx));
+        fout.write((const char *) &hparams.n_text_state,  sizeof(hparams.n_text_state));
+        fout.write((const char *) &hparams.n_text_head,   sizeof(hparams.n_text_head));
+        fout.write((const char *) &hparams.n_text_layer,  sizeof(hparams.n_text_layer));
+        fout.write((const char *) &hparams.n_mels,        sizeof(hparams.n_mels));
+        fout.write((const char *) &ftype_out,             sizeof(ftype_out));
+    }
+
+    // load mel filters
+    {
+        whisper_filters filters;
+
+        finp.read((char *) &filters.n_mel, sizeof(filters.n_mel));
+        finp.read((char *) &filters.n_fft, sizeof(filters.n_fft));
+
+        // the stream state is tested first, so the dimensions are only inspected once
+        // both of them have actually been read
+        if (!read_ok("mel filter dimensions")) {
+            return false;
+        }
+        if (filters.n_mel < 0 || filters.n_fft < 0) {
+            fprintf(stderr, "%s: invalid model file '%s' (bad mel filter size %d x %d)\n",
+                    __func__, fname_inp.c_str(), filters.n_mel, filters.n_fft);
+            return false;
+        }
+
+        fout.write((const char *) &filters.n_mel, sizeof(filters.n_mel));
+        fout.write((const char *) &filters.n_fft, sizeof(filters.n_fft));
+
+        // multiply in size_t so that large dimensions cannot overflow a 32-bit int
+        filters.data.resize((size_t) filters.n_mel * (size_t) filters.n_fft);
+        finp.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
+        if (!read_ok("mel filter data")) {
+            return false;
+        }
+        fout.write((const char *) filters.data.data(), filters.data.size() * sizeof(float));
+    }
+
+    // load vocab (copied through verbatim: entry count, then length-prefixed strings)
+    {
+        int32_t n_vocab = 0;
+        finp.read((char *) &n_vocab, sizeof(n_vocab));
+        if (!read_ok("vocab size")) {
+            return false;
+        }
+        fout.write((const char *) &n_vocab, sizeof(n_vocab));
+
+        //if (n_vocab != hparams.n_vocab) {
+        //    fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+        //            __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
+        //    return false;
+        //}
+
+        std::string word;
+        for (int i = 0; i < n_vocab; i++) {
+            uint32_t len = 0;
+            finp.read((char *) &len, sizeof(len));
+            if (!read_ok("vocab entry length")) {
+                return false;
+            }
+            fout.write((const char *) &len, sizeof(len));
+
+            word.resize(len);
+            if (len > 0) {
+                // &word[0] is the writable buffer; data() is read-only before C++17
+                finp.read(&word[0], len);
+                if (!read_ok("vocab entry")) {
+                    return false;
+                }
+                fout.write(word.data(), len);
+            }
+        }
+    }
+
+    // regexes of tensor names to not be quantized
+    const std::vector<std::string> to_skip = {
+        //"encoder.*",
+        "encoder.conv1.bias",
+        "encoder.conv2.bias",
+        "encoder.positional_embedding",
+        "decoder.positional_embedding",
+    };
+
+    // { ".*" } is presumably the list of name patterns that should be quantized - confirm
+    // against ggml_common_quantize_0() in common-ggml.cpp
+    if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
+        fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
+        return false;
+    }
+
+    finp.close();
+    fout.close();
+
+    // earlier write errors leave the stream in a failed state and close() adds flush
+    // failures (e.g. a full disk), so a single check after closing covers both
+    if (!fout) {
+        fprintf(stderr, "%s: failed to write '%s'\n", __func__, fname_out.c_str());
+        return false;
+    }
+
+    return true;
+}
+
+// Command-line entry point.
+//
+//   argv[1] - input model path
+//   argv[2] - output model path
+//   argv[3] - target type, parsed by ggml_parse_ftype()
+//
+// Exit status is 0 on success and 1 on a usage error or a failed quantization.
+int main(int argc, char ** argv) {
+    if (argc != 4) {
+        fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
+        ggml_print_ftypes(stderr);
+        return 1;
+    }
+
+    // a throw-away ggml context: creating one is what initializes the f16 tables
+    struct ggml_init_params init_params = { 0, NULL, false };
+    ggml_free(ggml_init(init_params));
+
+    const std::string path_src = argv[1];
+    const std::string path_dst = argv[2];
+
+    const ggml_ftype dst_type = ggml_parse_ftype(argv[3]);
+
+    // the first timestamp feeds the "total" figure, the second one only the quantization
+    const int64_t t_total_begin_us = ggml_time_us();
+    const int64_t t_quant_begin_us = ggml_time_us();
+
+    if (!whisper_model_quantize(path_src, path_dst, dst_type)) {
+        fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, path_src.c_str());
+        return 1;
+    }
+
+    const int64_t t_quant_us     = ggml_time_us() - t_quant_begin_us;
+    const int64_t t_total_end_us = ggml_time_us();
+
+    // report timing
+    printf("\n");
+    printf("%s: quantize time = %8.2f ms\n", __func__, t_quant_us/1000.0f);
+    printf("%s:    total time = %8.2f ms\n", __func__, (t_total_end_us - t_total_begin_us)/1000.0f);
+
+    return 0;
+}
--- a/examples/stream.wasm/index-tmpl.html
+++ b/examples/stream.wasm/index-tmpl.html
@@ -35,6 +35,15 @@

            <br><br>

+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
            <hr>

            Select the model you would like to use, click the "Start" button and start speaking
@@ -45,6 +54,10 @@
                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
                <span id="fetch-whisper-progress"></span>

                <!--
@@ -162,11 +175,17 @@
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
                };

                let url     = urls[model];
@@ -177,6 +196,10 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

                cbProgress = function(p) {
@@ -188,6 +211,10 @@
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                };

--- a/examples/talk-llama/llama-util.h
+++ b/examples/talk-llama/llama-util.h
@@ -21,12 +21,17 @@
        #if defined(_POSIX_MAPPED_FILES)
            #include <sys/mman.h>
        #endif
+        #if defined(_POSIX_MEMLOCK_RANGE)
+            #include <sys/resource.h>
+        #endif
    #endif
 #endif

 #if defined(_WIN32)
    #define WIN32_LEAN_AND_MEAN
-    #define NOMINMAX
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
    #include <windows.h>
    #include <io.h>
    #include <stdio.h> // for _fseeki64
@@ -41,8 +46,12 @@
    } while (0)

 #ifdef __GNUC__
+#ifdef __MINGW32__
+__attribute__((format(gnu_printf, 1, 2)))
+#else
 __attribute__((format(printf, 1, 2)))
 #endif
+#endif
 static std::string format(const char * fmt, ...) {
    va_list ap, ap2;
    va_start(ap, fmt);
@@ -55,7 +64,7 @@ static std::string format(const char * fmt, ...) {
    va_end(ap2);
    va_end(ap);
    return std::string(buf.data(), size);
-};
+}

 struct llama_file {
    // use FILE * so we don't have to re-open the file to mmap
@@ -162,7 +171,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;
        int fd = fileno(file->fp);
        int flags = MAP_SHARED;
@@ -170,15 +179,16 @@ struct llama_mmap {
        flags |= MAP_POPULATE;
 #endif
        addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
-        close(fd);
        if (addr == MAP_FAILED) {
            throw format("mmap failed: %s", strerror(errno));
        }

-        // Advise the kernel to preload the mapped memory
-        if (madvise(addr, file->size, MADV_WILLNEED)) {
-            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
-                    strerror(errno));
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            if (madvise(addr, file->size, MADV_WILLNEED)) {
+                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+                        strerror(errno));
+            }
        }
    }

@@ -188,14 +198,13 @@ struct llama_mmap {
 #elif defined(_WIN32)
    static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file) {
+    llama_mmap(struct llama_file * file, bool prefetch = true) {
        size = file->size;

        HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));

        HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
        DWORD error = GetLastError();
-        CloseHandle(hFile);

        if (hMapping == NULL) {
            throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@@ -209,14 +218,20 @@ struct llama_mmap {
            throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
        }

-        // Advise the kernel to preload the mapped memory
-        WIN32_MEMORY_RANGE_ENTRY range;
-        range.VirtualAddress = addr;
-        range.NumberOfBytes = (SIZE_T)size;
-        if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
-            fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
-                    llama_format_win_err(GetLastError()).c_str());
+        #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
+        if (prefetch) {
+            // Advise the kernel to preload the mapped memory
+            WIN32_MEMORY_RANGE_ENTRY range;
+            range.VirtualAddress = addr;
+            range.NumberOfBytes = (SIZE_T)size;
+            if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+                fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
+                        llama_format_win_err(GetLastError()).c_str());
+            }
        }
+        #else
+        #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
+        #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
    }

    ~llama_mmap() {
@@ -291,8 +306,18 @@ struct llama_mlock {
        if (!mlock(addr, size)) {
            return true;
        } else {
-            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION,
-                    size, this->size, std::strerror(errno));
+            char* errmsg = std::strerror(errno);
+            bool suggest = (errno == ENOMEM);
+
+            // Check if the resource limit is fine after all
+            struct rlimit lock_limit;
+            if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
+                suggest = false;
+            if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
+                suggest = false;
+
+            fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+                    size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
            return false;
        }
    }
@@ -338,8 +363,8 @@ struct llama_mlock {
            // Hopefully a megabyte is enough overhead:
            size_t increment = size + 1048576;
            // The minimum must be <= the maximum, so we need to increase both:
-            min_ws_size += size;
-            max_ws_size += size;
+            min_ws_size += increment;
+            max_ws_size += increment;
            if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
                fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
                        llama_format_win_err(GetLastError()).c_str());
@@ -380,4 +405,29 @@ struct llama_buffer {
        delete[] addr;
    }
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+// Byte buffer allocated with ggml_cuda_host_malloc() and released with ggml_cuda_host_free();
+// takes the place of llama_buffer when GGML_USE_CUBLAS is defined.
+// NOTE(review): the struct is implicitly copyable, and a copy would release the same
+//               pointer twice - confirm that no caller copies it
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    // replace the current allocation with a new one of `size` bytes (old contents are dropped)
+    void resize(size_t size) {
+        release();
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        this->size = size;
+    }
+
+    ~llama_ctx_buffer() {
+        release();
+    }
+
+private:
+    // give the current allocation, if any, back; addr and size are left untouched
+    void release() {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+    }
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
 #endif
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@@ -39,12 +39,16 @@ extern "C" {

    typedef struct llama_token_data {
        llama_token id;  // token id
-
+        float logit; // log-odds of the token
        float p;     // probability of the token
-        float plog;  // log probability of the token
-
    } llama_token_data;

+    // Pointer + length describing the candidate tokens handed to the llama_sample_* functions
+    typedef struct llama_token_data_array {
+        llama_token_data * data; // first candidate
+        size_t size;             // number of candidates starting at `data`
+        bool sorted;             // NOTE(review): presumably set once the candidates are ordered by logit - confirm in llama.cpp
+    } llama_token_data_array;
+
    typedef void (*llama_progress_callback)(float progress, void *ctx);

    struct llama_context_params {
@@ -65,6 +69,20 @@ extern "C" {
        void * progress_callback_user_data;
    };

+    // model file types
+    enum llama_ftype {
+        LLAMA_FTYPE_ALL_F32     = 0,
+        LLAMA_FTYPE_MOSTLY_F16  = 1,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_0 = 2,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1 = 3,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+        LLAMA_FTYPE_MOSTLY_Q4_2 = 5,  // except 1d tensors
+        // LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
+        LLAMA_FTYPE_MOSTLY_Q8_0 = 7,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_0 = 8,  // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q5_1 = 9,  // except 1d tensors
+    };
+
    LLAMA_API struct llama_context_params llama_context_default_params();

    LLAMA_API bool llama_mmap_supported();
@@ -82,27 +100,46 @@ extern "C" {

    // TODO: not great API - very likely to change
    // Returns 0 on success
+    // nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
    LLAMA_API int llama_model_quantize(
            const char * fname_inp,
            const char * fname_out,
-                   int   itype);
+      enum llama_ftype   ftype,
+            int          nthread);

-    // Returns the KV cache that will contain the context for the
-    // ongoing prediction with the model.
-    LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx);
-
-    // Returns the size of the KV cache
-    LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx);
+    // Apply a LoRA adapter to a loaded model
+    // path_base_model is the path to a higher quality model to use as a base for
+    // the layers modified by the adapter. Can be NULL to use the current loaded model.
+    // The model needs to be reloaded before applying a new adapter, otherwise the adapter
+    // will be applied on top of the previous one
+    // Returns 0 on success
+    LLAMA_API int llama_apply_lora_from_file(
+            struct llama_context * ctx,
+                      const char * path_lora,
+                      const char * path_base_model,
+                             int   n_threads);

    // Returns the number of tokens in the KV cache
    LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);

-    // Sets the KV cache containing the current context for the model
-    LLAMA_API void llama_set_kv_cache(
-            struct llama_context * ctx,
-                   const uint8_t * kv_cache,
-                          size_t   n_size,
-                             int   n_token_count);
+    // Sets the current rng seed.
+    LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
+
+    // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
+    LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
+
+    // Copies the state to the specified destination address.
+    // Destination needs to have allocated enough memory.
+    // Returns the number of bytes copied
+    LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
+
+    // Set the state reading from the specified address
+    // Returns the number of bytes read
+    LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
+
+    // Save/load session file
+    LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
+    LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);

    // Run the llama inference to obtain the logits and probabilities for the next token.
    // tokens + n_tokens is the provided batch of new tokens to process
@@ -148,16 +185,52 @@ extern "C" {
    // Special tokens
    LLAMA_API llama_token llama_token_bos();
    LLAMA_API llama_token llama_token_eos();
+    LLAMA_API llama_token llama_token_nl();

-    // TODO: improve the last_n_tokens interface ?
-    LLAMA_API llama_token llama_sample_top_p_top_k(
-       struct llama_context * ctx,
-          const llama_token * last_n_tokens_data,
-                        int   last_n_tokens_size,
-                        int   top_k,
-                      float   top_p,
-                      float   temp,
-                      float   repeat_penalty);
+    // Sampling functions
+
+    /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
+    LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
+
+    /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
+    LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
+
+    /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
+    LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
+
+    /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
+    LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+
+    /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
+    LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
+
+    /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
+    LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
+    LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
+
+    /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
+
+    /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
+    /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
+    /// @param tau  The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
+    /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
+    /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
+    LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
+
+    /// @details Selects the token with the highest probability.
+    LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
+
+    /// @details Randomly selects a token from the candidates based on their probabilities.
+    LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);

    // Performance information
    LLAMA_API void llama_print_timings(struct llama_context * ctx);
@@ -170,4 +243,15 @@ extern "C" {
 }
 #endif

+// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
+#ifdef LLAMA_API_INTERNAL
+
+#include <vector>
+#include <string>
+struct ggml_tensor;
+
+std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
+
+#endif
+
 #endif // LLAMA_H
--- a/examples/talk-llama/llama_internal.h
+++ b/examples/talk-llama/llama_internal.h
@@ -1,12 +0,0 @@
-// Internal header to be included by llama.cpp and tests/benchmarks only.
-
-#ifndef LLAMA_INTERNAL_H
-#define LLAMA_INTERNAL_H
-
-#include <vector>
-#include <string>
-struct ggml_tensor;
-
-std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
-
-#endif // LLAMA_INTERNAL_H
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@@ -487,11 +487,44 @@ int main(int argc, char ** argv) {

                         {
                             auto logits = llama_get_logits(ctx_llama);
+                            auto n_vocab = llama_n_vocab(ctx_llama);
+
                             logits[llama_token_eos()] = 0;
 
-                            id = llama_sample_top_p_top_k(ctx_llama,
+                            std::vector<llama_token_data> candidates;
+                            candidates.reserve(n_vocab);
+                            for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
+                                candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+                            }
+
+                            llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
+
+                            // apply repeat penalty
+                            const float nl_logit = logits[llama_token_nl()];
+
+                            llama_sample_repetition_penalty(ctx_llama, &candidates_p,
                                     embd_inp.data() + std::max(0, n_past - repeat_last_n),
-                                    repeat_last_n, top_k, top_p, temp, repeat_penalty);
+                                    repeat_last_n, repeat_penalty);
+
+                            // restore the newline logit in the candidate list: sampling below reads
+                            // candidates_p (a view over `candidates`), so writing back into `logits` has no effect
+                            for (auto & cand : candidates) {
+                                if (cand.id == llama_token_nl()) {
+                                    cand.logit = nl_logit;
+                                    break;
+                                }
+                            }
+
+                            if (temp <= 0) {
+                                // Greedy sampling
+                                id = llama_sample_token_greedy(ctx_llama, &candidates_p);
+                            } else {
+                                // Temperature sampling
+                                llama_sample_top_k(ctx_llama, &candidates_p, top_k);
+                                llama_sample_top_p(ctx_llama, &candidates_p, top_p);
+                                llama_sample_temperature(ctx_llama, &candidates_p, temp);
+                                id = llama_sample_token(ctx_llama, &candidates_p);
+                            }
                         }

                        if (id != llama_token_eos()) {
--- a/examples/talk.wasm/CMakeLists.txt
+++ b/examples/talk.wasm/CMakeLists.txt
@@ -13,6 +13,7 @@ include(DefaultTargetOptions)

 target_link_libraries(${TARGET} PRIVATE
    whisper
+    common
    )

 unset(EXTRA_FLAGS)
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@@ -1,4 +1,6 @@
 #include "ggml.h"
+#include "common-ggml.h"
+
 #include "gpt-2.h"

 #include <cmath>
@@ -14,150 +16,6 @@

 /////////////////////// GPT-2 BEGIN /////////////////////////

-//
-// Vocab utils
-//
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.size() == 0) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const gpt_vocab & vocab,
-        const float * logits,
-        int    top_k,
-        double top_p,
-        double temp,
-        std::mt19937 & rng) {
-    int n_logits = vocab.id_to_token.size();
-
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    for (int i = 0; i < n_logits; i++) {
-        logits_id.push_back(std::make_pair(logits[i], i));
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    // normalize
-    {
-        double sum = 0.0f;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            sum += logits_id[i].first;
-        }
-
-        sum = 1.0/sum;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            logits_id[i].first *= sum;
-        }
-    }
-
-    if (top_p < 1.0f) {
-        {
-            double cumsum = 0.0f;
-            for (int i = 0; i < top_k; i++) {
-                cumsum += logits_id[i].first;
-                if (cumsum >= top_p) {
-                    logits_id.resize(i+1);
-                    break;
-                }
-            }
-        }
-
-        // normalize again
-        {
-            double sum = 0.0f;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                sum += logits_id[i].first;
-            }
-
-            sum = 1.0/sum;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                logits_id[i].first *= sum;
-            }
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int)logits_id.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
-    //}
-    //exit(0);
-
-    // sample from the obtained distribution
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    for (int i = 0; i < (int) logits_id.size(); i++) {
-        probs.push_back(logits_id[i].first);
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-}
-
 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
    int32_t n_vocab = 50257;
@@ -165,7 +23,7 @@ struct gpt2_hparams {
    int32_t n_embd  = 768;
    int32_t n_head  = 12;
    int32_t n_layer = 12;
-    int32_t f16     = 1;
+    int32_t ftype   = 1;
 };

 struct gpt2_layer {
@@ -187,7 +45,7 @@ struct gpt2_layer {
    struct ggml_tensor * c_mlp_fc_w;
    struct ggml_tensor * c_mlp_fc_b;

-    struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
+    struct ggml_tensor * c_mlp_proj_w;
    struct ggml_tensor * c_mlp_proj_b;
 };

@@ -198,8 +56,9 @@ struct gpt2_model {
    struct ggml_tensor * ln_f_g;
    struct ggml_tensor * ln_f_b;

-    struct ggml_tensor * wte; // position embedding
-    struct ggml_tensor * wpe; //    token embedding
+    struct ggml_tensor * wte;     //    token embedding
+    struct ggml_tensor * wpe;     // position embedding
+    struct ggml_tensor * lm_head; // language model head

    std::vector<gpt2_layer> layers;

@@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
    }

    // load vocab
@@ -275,9 +134,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        }
    }

-    // for the big tensors, we have the option to store the data in 16-bit floats
+    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
-    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return false;
+    }

    auto & ctx = model.ctx;

@@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

-        ctx_size += n_vocab*n_embd*ggml_type_size(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
+        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head

-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
+        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_size(GGML_TYPE_F32));   // c_attn_proj_b
+        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
+        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
+        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
+        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = NULL;
+        struct ggml_init_params params = { // positional init: designated initializers require C++20
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
+        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@@ -350,36 +217,38 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-        model.wte = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
+        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);

        // map by name
        model.tensors["model/ln_f/g"] = model.ln_f_g;
        model.tensors["model/ln_f/b"] = model.ln_f_b;

-        model.tensors["model/wte"] = model.wte;
-        model.tensors["model/wpe"] = model.wpe;
+        model.tensors["model/wte"]     = model.wte;
+        model.tensors["model/wpe"]     = model.wpe;
+        model.tensors["model/lm_head"] = model.lm_head;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];

-            layer.ln_1_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.ln_2_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.c_attn_attn_w      = ggml_new_tensor_2d(ctx, wtype,         3*n_embd, n_embd);
-            layer.c_attn_attn_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
+            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
+            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

-            layer.c_attn_proj_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
+            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.c_mlp_fc_w         = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_fc_b         = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
+            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
+            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

-            layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
+            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            // map by name
            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
@@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;

-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w_trans;
+            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
        }
    }
@@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
    {
        size_t total_size = 0;

+        bool has_lm_head = false;
+
        while (true) {
            int32_t n_dims;
            int32_t length;
-            int32_t ftype;
+            int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (fin.eof()) {
                break;
@@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                        __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
                return false;
            }

-            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+            // for debugging
+            if (0) {
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            }

-            if (nelements*bpe != ggml_nbytes(tensor)) {
+            const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                return false;
@@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

-            //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            // GPT-2 models share the WTE tensor as the LM head
+            if (name == "model/wte" && has_lm_head == false) {
+                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
+            }
+
+            if (name == "model/lm_head") {
+                has_lm_head = true;
+            }
+
            total_size += ggml_nbytes(tensor);
        }

@@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 //   - n_threads: number of threads to use
 //   - n_past:    the context size so far
 //   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted probabilities of the next token
+//   - embd_w:    the predicted logits for the next token
 //
 bool gpt2_eval(
        const gpt2_model & model,
@@ -512,12 +396,12 @@ bool gpt2_eval(
    const int n_head  = hparams.n_head;
    const int n_vocab = hparams.n_vocab;

-    static size_t buf_size = 640u*1024*1024;
+    static size_t buf_size = 512u*1024*1024;
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
        buf_size = buf_size_new;
@@ -528,13 +412,14 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };

    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph gf = { };
+    struct ggml_cgraph gf = {};
    gf.n_threads = n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -578,7 +463,7 @@ bool gpt2_eval(
        // [2304, N]
        {
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
+                    model.layers[il].c_attn_attn_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -654,11 +539,13 @@ bool gpt2_eval(
            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            // [n_past + N, 64, 12]
            struct ggml_tensor * V_trans =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
+                ggml_cpy(ctx0,
+                        ggml_permute(ctx0,
+                            ggml_reshape_3d(ctx0,
+                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                                n_embd/n_head, n_head, n_past + N),
+                            1, 2, 0, 3),
+                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

            // KQV = transpose(V) * KQ_soft_max
            // [64, N, 12]
@@ -685,7 +572,7 @@ bool gpt2_eval(
        // [768, N]
        {
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                    model.layers[il].c_attn_proj_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -722,7 +609,7 @@ bool gpt2_eval(
            // cur = fc_w*cur + fc_b
            // [3072, N]
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+                    model.layers[il].c_mlp_fc_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -742,7 +629,7 @@ bool gpt2_eval(
            // cur = proj_w*cur + proj_b
            // [768, N]
            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w_trans,
+                    model.layers[il].c_mlp_proj_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -769,12 +656,12 @@ bool gpt2_eval(
    }

    // inpL = WTE * inpL
-    // [ 768, 50257] - model.wte
+    // [ 768, 50257] - model.lm_head
    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.wte, inpL);
+    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

    // logits -> probs
-    inpL = ggml_soft_max(ctx0, inpL);
+    //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
@@ -788,7 +675,7 @@ bool gpt2_eval(
    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

-    // return result for just the last token
+    // return result just for the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

@@ -825,7 +712,7 @@ Me too.
    int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());

    // sampling parameters
-    int32_t top_k = 40;
+    int32_t top_k = 5;
    float   top_p = 0.9f;
    float   temp  = 1.0f;
 };
@@ -833,14 +720,14 @@ Me too.
 struct gpt2_context * gpt2_init(const char * path_model) {
    gpt2_context * ctx = new gpt2_context;

-    ctx->rng = std::mt19937(time(NULL));
+    ctx->rng = std::mt19937(time(nullptr));

    // load the model
    {
        const int64_t t_start_us = ggml_time_us();

        if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
-            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin");
+            fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
            delete ctx;
            return nullptr;
        }
@@ -885,9 +772,9 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)

    std::string result;

-    for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) {
+    for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
        // predict
-        if (embd.size() > 0) {
+        if (!embd.empty()) {
            if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
                printf("gpt-2: failed to generate text\n");
                return "";
@@ -914,10 +801,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
        result += ctx->vocab.id_to_token[embd[0]];

        // end of text token
-        if (embd.back() == 50256 ||
-            ctx->vocab.id_to_token[embd.back()] == "." ||
-            ctx->vocab.id_to_token[embd.back()] == "!" ||
-            ctx->vocab.id_to_token[embd.back()] == "?") {
+        if (embd.back() == 50256) {
            break;
        }
    }
--- a/examples/talk.wasm/gpt-2.h
+++ b/examples/talk.wasm/gpt-2.h
@@ -2,18 +2,12 @@

 // TODO: Change to C-style API and move to ./examples for easy reuse.

+#include "common.h"
+
 #include <vector>
 #include <map>
 #include <string>

-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-};
-
 struct gpt2_context;

 struct gpt2_context * gpt2_init(const char * path_model);
--- a/examples/talk.wasm/index-tmpl.html
+++ b/examples/talk.wasm/index-tmpl.html
@@ -44,6 +44,15 @@

            <br><br>

+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a> |
+
+            <br><br>
+
            <hr>

            Select the models you would like to use and click the "Start" button to begin the conversation
@@ -54,6 +63,10 @@
                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
                <span id="fetch-whisper-progress"></span>

                <!--
@@ -266,11 +279,17 @@
                let urls = {
                    'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
                    'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
                };

                let sizes = {
                    'tiny.en': 75,
                    'base.en': 142,
+
+                    'tiny-en-q5_1':   31,
+                    'base-en-q5_1':   57,
                };

                let url     = urls[model];
@@ -281,6 +300,10 @@

                document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
                document.getElementById('fetch-whisper-base-en').style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
+
                document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';

                cbProgress = function(p) {
@@ -292,6 +315,10 @@
                    var el;
                    el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
+
                    el = document.getElementById('model-whisper-status');  if (el) el.innerHTML = '';
                };

--- a/examples/talk/CMakeLists.txt
+++ b/examples/talk/CMakeLists.txt
@@ -1,16 +1,8 @@
 if (WHISPER_SDL2)
    # talk
    set(TARGET talk)
-    #add_executable(${TARGET} talk.cpp gpt-2.cpp)
-    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
-    #target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
-    # TODO: this is temporary
-    #       need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
+    add_executable(${TARGET} talk.cpp gpt-2.cpp)
+    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})

    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
-    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
 endif ()
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@@ -1,4 +1,6 @@
 #include "ggml.h"
+#include "common-ggml.h"
+
 #include "gpt-2.h"

 #include <cmath>
@@ -14,150 +16,6 @@

 /////////////////////// GPT-2 BEGIN /////////////////////////

-//
-// Vocab utils
-//
-
-std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
-    std::vector<std::string> words;
-
-    // first split the text into words
-    {
-        std::string str = text;
-        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
-
-        std::regex re(pat);
-        std::smatch m;
-
-        while (std::regex_search(str, m, re)) {
-            for (auto x : m) {
-                words.push_back(x);
-            }
-            str = m.suffix();
-        }
-    }
-
-    // find the longest tokens that form the words:
-    std::vector<gpt_vocab::id> tokens;
-    for (const auto & word : words) {
-        if (word.empty()) continue;
-
-        int i = 0;
-        int n = word.size();
-        while (i < n) {
-            int j = n;
-            while (j > i) {
-                auto it = vocab.token_to_id.find(word.substr(i, j-i));
-                if (it != vocab.token_to_id.end()) {
-                    tokens.push_back(it->second);
-                    i = j;
-                    break;
-                }
-                --j;
-            }
-            if (i == n) {
-                break;
-            }
-            if (j == i) {
-                auto sub = word.substr(i, 1);
-                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
-                    tokens.push_back(vocab.token_to_id.at(sub));
-                } else {
-                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
-                }
-                ++i;
-            }
-        }
-    }
-
-    return tokens;
-}
-
-gpt_vocab::id gpt_sample_top_k_top_p(
-        const gpt_vocab & vocab,
-        const float * logits,
-        int    top_k,
-        double top_p,
-        double /*temp*/,
-        std::mt19937 & rng) {
-    int n_logits = vocab.id_to_token.size();
-
-    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
-    logits_id.reserve(n_logits);
-
-    for (int i = 0; i < n_logits; i++) {
-        logits_id.emplace_back(logits[i], i);
-    }
-
-    // find the top K tokens
-    std::partial_sort(
-            logits_id.begin(),
-            logits_id.begin() + top_k, logits_id.end(),
-            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
-        return a.first > b.first;
-    });
-
-    logits_id.resize(top_k);
-
-    // normalize
-    {
-        double sum = 0.0f;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            sum += logits_id[i].first;
-        }
-
-        sum = 1.0/sum;
-        for (int i = 0; i < (int)logits_id.size(); i++) {
-            logits_id[i].first *= sum;
-        }
-    }
-
-    if (top_p < 1.0f) {
-        {
-            double cumsum = 0.0f;
-            for (int i = 0; i < top_k; i++) {
-                cumsum += logits_id[i].first;
-                if (cumsum >= top_p) {
-                    logits_id.resize(i+1);
-                    break;
-                }
-            }
-        }
-
-        // normalize again
-        {
-            double sum = 0.0f;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                sum += logits_id[i].first;
-            }
-
-            sum = 1.0/sum;
-            for (int i = 0; i < (int)logits_id.size(); i++) {
-                logits_id[i].first *= sum;
-            }
-        }
-    }
-
-    //printf("\n");
-    //for (int i = 0; i < (int) logits_id.size(); i++) {
-    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
-    //}
-    //exit(0);
-
-    // sample from the obtained distribution
-    std::vector<double> probs;
-    probs.reserve(logits_id.size());
-
-    for (int i = 0; i < (int) logits_id.size(); i++) {
-        probs.push_back(logits_id[i].first);
-    }
-
-    std::discrete_distribution<> dist(probs.begin(), probs.end());
-    int idx = dist(rng);
-
-    return logits_id[idx].second;
-}
-
 // default hparams (GPT-2 117M)
 struct gpt2_hparams {
    int32_t n_vocab = 50257;
@@ -165,7 +23,7 @@ struct gpt2_hparams {
    int32_t n_embd  = 768;
    int32_t n_head  = 12;
    int32_t n_layer = 12;
-    int32_t f16     = 1;
+    int32_t ftype   = 1;
 };

 struct gpt2_layer {
@@ -187,7 +45,7 @@ struct gpt2_layer {
    struct ggml_tensor * c_mlp_fc_w;
    struct ggml_tensor * c_mlp_fc_b;

-    struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency
+    struct ggml_tensor * c_mlp_proj_w;
    struct ggml_tensor * c_mlp_proj_b;
 };

@@ -198,8 +56,9 @@ struct gpt2_model {
    struct ggml_tensor * ln_f_g;
    struct ggml_tensor * ln_f_b;

-    struct ggml_tensor * wte; // position embedding
-    struct ggml_tensor * wpe; //    token embedding
+    struct ggml_tensor * wte;     //    token embedding
+    struct ggml_tensor * wpe;     // position embedding
+    struct ggml_tensor * lm_head; // language model head

    std::vector<gpt2_layer> layers;

@@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        fin.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
        fin.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
        fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fin.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));

        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
    }

    // load vocab
@@ -268,16 +127,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            fin.read((char *) &len, sizeof(len));

            word.resize(len);
-            fin.read((char *) &word[0], len);
+            fin.read((char *) word.data(), len);

            vocab.token_to_id[word] = i;
            vocab.id_to_token[i] = word;
        }
    }

-    // for the big tensors, we have the option to store the data in 16-bit floats
+    // for the big tensors, we have the option to store the data in 16-bit floats or quantized
    // in order to save memory and also to speed up the computation
-    const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+    if (wtype == GGML_TYPE_COUNT) {
+        fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+                __func__, fname.c_str(), model.hparams.ftype);
+        return false;
+    }

    auto & ctx = model.ctx;

@@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        const int n_ctx   = hparams.n_ctx;
        const int n_vocab = hparams.n_vocab;

-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g
-        ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+        ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

-        ctx_size += n_vocab*n_embd*ggml_type_size(wtype);         // wte
-        ctx_size +=   n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // wte
+        ctx_size +=   n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+        ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype);         // lm_head

-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g
-        ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+        ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

-        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype));         // c_attn_attn_w
-        ctx_size += n_layer*(       3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b
+        ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype));         // c_attn_attn_w
+        ctx_size += n_layer*(       3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

-        ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype));           // c_attn_proj_w
-        ctx_size += n_layer*(       n_embd*ggml_type_size(GGML_TYPE_F32));   // c_attn_proj_b
+        ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype));           // c_attn_proj_w
+        ctx_size += n_layer*(       n_embd*ggml_type_sizef(GGML_TYPE_F32));   // c_attn_proj_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_fc_w
-        ctx_size += n_layer*(       4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_fc_w
+        ctx_size += n_layer*(       4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

-        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype));         // c_mlp_proj_w
-        ctx_size += n_layer*(         n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b
+        ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype));         // c_mlp_proj_w
+        ctx_size += n_layer*(         n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k
-        ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+        ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v

        ctx_size += (6 + 12*n_layer)*256; // object overhead

@@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = nullptr;
+        struct ggml_init_params params = { // positional init: designated initializers are only standard since C++20
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false,
+        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@@ -350,36 +217,38 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
        model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
        model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);

-        model.wte = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
-        model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+        model.wte     = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);
+        model.wpe     = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+        model.lm_head = ggml_new_tensor_2d(ctx, wtype,         n_embd, n_vocab);

        // map by name
        model.tensors["model/ln_f/g"] = model.ln_f_g;
        model.tensors["model/ln_f/b"] = model.ln_f_b;

-        model.tensors["model/wte"] = model.wte;
-        model.tensors["model/wpe"] = model.wpe;
+        model.tensors["model/wte"]     = model.wte;
+        model.tensors["model/wpe"]     = model.wpe;
+        model.tensors["model/lm_head"] = model.lm_head;

        for (int i = 0; i < n_layer; ++i) {
            auto & layer = model.layers[i];

-            layer.ln_1_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_1_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.ln_1_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.ln_1_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.ln_2_g             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
-            layer.ln_2_b             = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.ln_2_g        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.ln_2_b        = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.c_attn_attn_w      = ggml_new_tensor_2d(ctx, wtype,         3*n_embd, n_embd);
-            layer.c_attn_attn_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
+            layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, 3*n_embd);
+            layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);

-            layer.c_attn_proj_w      = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
-            layer.c_attn_proj_b      = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype,           n_embd, n_embd);
+            layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

-            layer.c_mlp_fc_w         = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_fc_b         = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
+            layer.c_mlp_fc_w    = ggml_new_tensor_2d(ctx, wtype,           n_embd, 4*n_embd);
+            layer.c_mlp_fc_b    = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);

-            layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
-            layer.c_mlp_proj_b       = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);
+            layer.c_mlp_proj_w  = ggml_new_tensor_2d(ctx, wtype,         4*n_embd, n_embd);
+            layer.c_mlp_proj_b  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32,   n_embd);

            // map by name
            model.tensors["model/h" + std::to_string(i) + "/ln_1/g"]        = layer.ln_1_g;
@@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"]    = layer.c_mlp_fc_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"]    = layer.c_mlp_fc_b;

-            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w_trans;
+            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"]  = layer.c_mlp_proj_w;
            model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"]  = layer.c_mlp_proj_b;
        }
    }
@@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
    {
        size_t total_size = 0;

+        bool has_lm_head = false;
+
        while (true) {
            int32_t n_dims;
            int32_t length;
-            int32_t ftype;
+            int32_t ttype;

            fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
            fin.read(reinterpret_cast<char *>(&length), sizeof(length));
-            fin.read(reinterpret_cast<char *>(&ftype),  sizeof(ftype));
+            fin.read(reinterpret_cast<char *>(&ttype),  sizeof(ttype));

            if (fin.eof()) {
                break;
@@ -448,7 +319,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
            std::string name(length, 0);
            fin.read(&name[0], length);

-            if (model.tensors.find(name) == model.tensors.end()) {
+            if (model.tensors.find(name.data()) == model.tensors.end()) {
                fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
                return false;
            }
@@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

            if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
                fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
-                        __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+                        __func__, name.data(), ne[0], ne[1], (int) tensor->ne[0], (int) tensor->ne[1]); // got = ne[], expected = tensor->ne
                return false;
            }

-            const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+            // for debugging
+            if (0) {
+                printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+            }

-            if (nelements*bpe != ggml_nbytes(tensor)) {
+            const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+            if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
                fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
                        __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
                return false;
@@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

            fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));

-            //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+            // GPT-2 models share the WTE tensor as the LM head
+            if (name == "model/wte" && has_lm_head == false) {
+                memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
+            }
+
+            if (name == "model/lm_head") {
+                has_lm_head = true;
+            }
+
            total_size += ggml_nbytes(tensor);
        }

@@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
 //   - n_threads: number of threads to use
 //   - n_past:    the context size so far
 //   - embd_inp:  the embeddings of the tokens in the context
-//   - embd_w:    the predicted probabilities of the next token
+//   - embd_w:    the predicted logits for the next token
 //
 bool gpt2_eval(
        const gpt2_model & model,
@@ -512,12 +396,12 @@ bool gpt2_eval(
    const int n_head  = hparams.n_head;
    const int n_vocab = hparams.n_vocab;

-    static size_t buf_size = 5640ull*1024*1024;
+    static size_t buf_size = 512u*1024*1024;
    static void * buf = malloc(buf_size);

    if (mem_per_token > 0 && mem_per_token*N > buf_size) {
        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
-        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+        //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

        // reallocate
        buf_size = buf_size_new;
@@ -528,13 +412,14 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };

    struct ggml_context * ctx0 = ggml_init(params);
-
-    struct ggml_cgraph gf = { };
+    struct ggml_cgraph gf = {};
    gf.n_threads = n_threads;

    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -578,7 +463,7 @@ bool gpt2_eval(
        // [2304, N]
        {
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_attn_w),
+                    model.layers[il].c_attn_attn_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -654,11 +539,13 @@ bool gpt2_eval(
            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            // [n_past + N, 64, 12]
            struct ggml_tensor * V_trans =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
-                            n_embd/n_head, n_head, n_past + N),
-                        1, 2, 0, 3);
+                ggml_cpy(ctx0,
+                        ggml_permute(ctx0,
+                            ggml_reshape_3d(ctx0,
+                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                                n_embd/n_head, n_head, n_past + N),
+                            1, 2, 0, 3),
+                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));

            // KQV = transpose(V) * KQ_soft_max
            // [64, N, 12]
@@ -685,7 +572,7 @@ bool gpt2_eval(
        // [768, N]
        {
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_attn_proj_w),
+                    model.layers[il].c_attn_proj_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -722,7 +609,7 @@ bool gpt2_eval(
            // cur = fc_w*cur + fc_b
            // [3072, N]
            cur = ggml_mul_mat(ctx0,
-                    ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w),
+                    model.layers[il].c_mlp_fc_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -742,7 +629,7 @@ bool gpt2_eval(
            // cur = proj_w*cur + proj_b
            // [768, N]
            cur = ggml_mul_mat(ctx0,
-                    model.layers[il].c_mlp_proj_w_trans,
+                    model.layers[il].c_mlp_proj_w,
                    cur);

            cur = ggml_add(ctx0,
@@ -769,12 +656,12 @@ bool gpt2_eval(
    }

    // inpL = WTE * inpL
-    // [ 768, 50257] - model.wte
+    // [ 768, 50257] - model.lm_head
    // [ 768, N]     - inpL
-    inpL = ggml_mul_mat(ctx0, model.wte, inpL);
+    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

    // logits -> probs
-    inpL = ggml_soft_max(ctx0, inpL);
+    //inpL = ggml_soft_max(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
@@ -788,7 +675,7 @@ bool gpt2_eval(
    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

-    // return result for just the last token
+    // return result just for the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

--- a/examples/talk/gpt-2.h
+++ b/examples/talk/gpt-2.h
@@ -2,18 +2,12 @@

 // TODO: Change to C-style API and move to ./examples for easy reuse.

+#include "common.h"
+
 #include <vector>
 #include <map>
 #include <string>

-struct gpt_vocab {
-    using id    = int32_t;
-    using token = std::string;
-
-    std::map<token, id> token_to_id;
-    std::map<id, token> id_to_token;
-};
-
 struct gpt2_context;

 struct gpt2_context * gpt2_init(const char * path_model);
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@@ -31,9 +31,9 @@ endif()
 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=1500MB \
-    -s TOTAL_MEMORY=1500MB \
+    -s PTHREAD_POOL_SIZE_STRICT=0 \
+    -s INITIAL_MEMORY=2000MB \
+    -s TOTAL_MEMORY=2000MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/whisper.wasm/emscripten.cpp
+++ b/examples/whisper.wasm/emscripten.cpp
@@ -10,6 +10,12 @@ std::thread g_worker;

 std::vector<struct whisper_context *> g_contexts(4, nullptr);

+static inline int mpow2(int n) { // largest power of two <= n, clamped to at least 1
+    int p = 1;
+    while (p <= n/2) p *= 2; // bounding by n/2 avoids int overflow; n <= 1 (e.g. unknown core count 0) yields 1
+    return p;
+}
+
 EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        if (g_worker.joinable()) {
@@ -43,7 +49,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        }
    }));

-    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
+    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) {
        if (g_worker.joinable()) {
            g_worker.join();
        }
@@ -66,7 +72,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        params.print_special    = false;
        params.translate        = translate;
        params.language         = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
-        params.n_threads        = std::min(8, (int) std::thread::hardware_concurrency());
+        params.n_threads        = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
        params.offset_ms        = 0;

        std::vector<float> pcmf32;
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@@ -40,21 +40,42 @@

            Note that the computation is quite heavy and may take a few seconds to complete.<br>
            The transcription results will be displayed in the text area below.<br><br>
-            <b>Important: your browser must support WASM SIMD instructions for this to work.</b>
+            <b>Important:</b>
+                <ul>
+                    <li>your browser must support WASM SIMD instructions for this to work</li>
+                    <li>Firefox cannot load files larger than 256 MB - use Chrome instead</li>
+                </ul>

-            <br><br><hr>
+            <b>More examples:</b>
+                <a href="https://whisper.ggerganov.com/">main</a> |
+                <a href="https://whisper.ggerganov.com/bench">bench</a> |
+                <a href="https://whisper.ggerganov.com/stream">stream</a> |
+                <a href="https://whisper.ggerganov.com/command">command</a> |
+                <a href="https://whisper.ggerganov.com/talk">talk</a>
+
+            <hr>

            <div id="model">
-                Whisper model: <span id="model-whisper-status"></span>
+                Whisper models: <span id="model-whisper-status"></span><br><br>
                <button id="fetch-whisper-tiny-en"  onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-tiny"     onclick="loadWhisper('tiny')">tiny (75 MB)</button>
                <button id="fetch-whisper-base-en"  onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <button id="fetch-whisper-base"     onclick="loadWhisper('base')">base (142 MB)</button>
                <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
                <button id="fetch-whisper-small"    onclick="loadWhisper('small')">small (466 MB)</button>
-                <span id="fetch-whisper-progress"></span>
-
                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+                <br><br>
+                Quantized models:<br><br>
+                <button id="fetch-whisper-tiny-en-q5_1"   onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-tiny-q5_1"      onclick="loadWhisper('tiny-q5_1')">tiny (Q5_1, 31 MB)</button>
+                <button id="fetch-whisper-base-en-q5_1"   onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
+                <button id="fetch-whisper-base-q5_1"      onclick="loadWhisper('base-q5_1')">base (Q5_1, 57 MB)</button>
+                <button id="fetch-whisper-small-en-q5_1"  onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
+                <button id="fetch-whisper-small-q5_1"     onclick="loadWhisper('small-q5_1')">small (Q5_1, 182 MB)</button><br>
+                <button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
+                <button id="fetch-whisper-medium-q5_0"    onclick="loadWhisper('medium-q5_0')">medium (Q5_0, 515 MB)</button>
+                <button id="fetch-whisper-large-q5_0"     onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
+                <span id="fetch-whisper-progress"></span>
            </div>

            <br>
@@ -161,6 +182,12 @@
                            <option value="yi">Yiddish</option>
                        </select>
                    </td>
+                    <!-- Slider to select number of threads between 1 and 16 -->
+                    <td>
+                        Threads:
+                        <input type="range" id="threads" name="threads" min="1" max="16" value="8" onchange="changeThreads(this.value)" />
+                        <span id="threads-value">8</span>
+                    </td>
                    <td>
                        <button onclick="onProcess(false);">Transcribe</button>
                    </td>
@@ -263,11 +290,13 @@

                Module.FS_createDataFile("/", fname, buf, true, true);

-                model_whisper = fname;
+                //model_whisper = fname;

                document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';

                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
+
+                document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper;
            }

            function loadFile(event, fname) {
@@ -292,6 +321,17 @@
                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-tiny-q5_1'     ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-q5_1'     ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-q5_1'    ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-medium-q5_0'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
                document.getElementById('whisper-file'          ).style.display = 'none';
                document.getElementById('model-whisper-status'  ).innerHTML = 'loaded model: ' + file.name;
            }
@@ -304,6 +344,16 @@
                    'base':     'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
                    'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
                    'small':    'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
+
+                    'tiny-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
+                    'tiny-q5_1':     'https://whisper.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin',
+                    'base-en-q5_1':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
+                    'base-q5_1':     'https://whisper.ggerganov.com/ggml-model-whisper-base-q5_1.bin',
+                    'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
+                    'small-q5_1':    'https://whisper.ggerganov.com/ggml-model-whisper-small-q5_1.bin',
+                    'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
+                    'medium-q5_0':   'https://whisper.ggerganov.com/ggml-model-whisper-medium-q5_0.bin',
+                    'large-q5_0':    'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
                };

                let sizes = {
@@ -313,6 +363,16 @@
                    'base':     142,
                    'small.en': 466,
                    'small':    466,
+
+                    'tiny-en-q5_1':   31,
+                    'tiny-q5_1':      31,
+                    'base-en-q5_1':   57,
+                    'base-q5_1':      57,
+                    'small-en-q5_1':  182,
+                    'small-q5_1':     182,
+                    'medium-en-q5_0': 515,
+                    'medium-q5_0':    515,
+                    'large-q5_0':     1030,
                };

                let url     = urls[model];
@@ -327,8 +387,19 @@
                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
-                document.getElementById('whisper-file'          ).style.display = 'none';
-                document.getElementById('model-whisper-status'  ).innerHTML = 'loading model: ' + model;
+
+                document.getElementById('fetch-whisper-tiny-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-tiny-q5_1'     ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-en-q5_1'  ).style.display = 'none';
+                document.getElementById('fetch-whisper-base-q5_1'     ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
+                document.getElementById('fetch-whisper-small-q5_1'    ).style.display = 'none';
+                document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
+                document.getElementById('fetch-whisper-medium-q5_0'   ).style.display = 'none';
+                document.getElementById('fetch-whisper-large-q5_0'    ).style.display = 'none';
+
+                document.getElementById('whisper-file'        ).style.display = 'none';
+                document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model;

                cbProgress = function(p) {
                    let el = document.getElementById('fetch-whisper-progress');
@@ -337,14 +408,26 @@

                cbCancel = function() {
                    var el;
+
                    el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-tiny'    ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-base'    ); if (el) el.style.display = 'inline-block';
                    el = document.getElementById('fetch-whisper-small'   ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('whisper-file'          ); if (el) el.style.display = 'inline-block';
-                    el = document.getElementById('model-whisper-status'  ); if (el) el.innerHTML = '';
+
+                    el = document.getElementById('fetch-whisper-tiny-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-tiny-q5_1'     ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-en-q5_1'  ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-base-q5_1'     ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-small-q5_1'    ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-medium-q5_0'   ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('fetch-whisper-large-q5_0'    ); if (el) el.style.display = 'inline-block';
+
+                    el = document.getElementById('whisper-file'        ); if (el) el.style.display = 'inline-block';
+                    el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
                };

                loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
@@ -354,7 +437,8 @@
            // audio file
            //

-            const kMaxAudio_s = 120;
+            const kMaxAudio_s = 30*60;
+            const kMaxRecording_s = 2*60;
            const kSampleRate = 16000;

            window.AudioContext = window.AudioContext || window.webkitAudioContext;
@@ -423,7 +507,7 @@
                doRecording = false;
            }

-            // record up to kMaxAudio_s seconds of audio from the microphone
+            // record up to kMaxRecording_s seconds of audio from the microphone
            // check if doRecording is false every 1000 ms and stop recording if so
            // update progress information
            function startRecording() {
@@ -479,9 +563,9 @@
                                        printTextarea('js: audio recorded, size: ' + audio.length);

                                         // truncate to the maximum allowed recording length
-                                        if (audio.length > kMaxAudio_s*kSampleRate) {
-                                            audio = audio.slice(0, kMaxAudio_s*kSampleRate);
-                                            printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds');
+                                        if (audio.length > kMaxRecording_s*kSampleRate) {
+                                            audio = audio.slice(0, kMaxRecording_s*kSampleRate);
+                                            printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds');
                                        }
                                        setAudio(audio);
                                    });
@@ -509,24 +593,31 @@
                        });
                    }

-                    document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%';
-                    document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%';
+                    document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%';
+                    document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%';
                }, 1000);

                printTextarea('js: recording ...');

                setTimeout(function() {
                    if (doRecording) {
-                        printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds');
+                        printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds');
                        stopRecording();
                    }
-                }, kMaxAudio_s*1000);
+                }, kMaxRecording_s*1000);
            }

            //
            // transcribe
            //

+            var nthreads = 8; // thread count passed to Module.full_default; starts at 8
+
+            function changeThreads(value) { // store a new thread count and mirror it on the page
+                nthreads = value; // kept exactly as received (no numeric coercion) - NOTE(review): confirm callers pass a number
+                document.getElementById('threads-value').innerHTML = nthreads; // show the current value in the 'threads-value' element
+            }
+
            function onProcess(translate) {
                if (!instance) {
                    instance = Module.init('whisper.bin');
@@ -553,7 +644,7 @@
                    printTextarea('');

                    setTimeout(function() {
-                        var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate);
+                        var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate);
                        console.log('js: full_default returned: ' + ret);
                        if (ret) {
                            printTextarea("js: whisper returned: " + ret);