whisper : add integer quantization support (#540)

* whisper : add integer quantization support * examples : add common-ggml + prepare to add "quantize" tool * whisper : quantization tool ready * whisper : fix F32 support * whisper : try to fix shared lib linkage * wasm : update quantized models to Q5 * bench.wasm : remove "medium" button * bench.wasm : fix custom model button * ggml : add Q5_0 and Q5_1 WASM SIMD * wasm : add quantized models to all WASM examples * wasm : bump DB version number to 2 * talk-llama : update example to latest llama.cpp * node : increase test timeout to 10s * readme : add information for model quantization * wasm : add links to other examples
2025-08-15 20:47:59 +02:00 · 2023-04-30 18:51:57 +03:00
parent 5fd1bdd7fc
commit 794b162a46
41 changed files with 3183 additions and 1010 deletions
--- a/examples/common.h
+++ b/examples/common.h
@ -1,10 +1,44 @@
+// Various helper functions and utilities
+
 #pragma once

-// needs to match WHISPER_SAMPLE_RATE
+#include <string>
+#include <map>
+#include <vector>
+#include <random>
+#include <thread>
+
 #define COMMON_SAMPLE_RATE 16000

-#include <vector>
-#include <string>
+//
+// CLI argument parsing
+//
+
+struct gpt_params {
+    int32_t seed      = -1; // RNG seed
+    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    int32_t n_predict = 200; // new tokens to predict
+
+    // sampling parameters
+    int32_t top_k = 40;
+    float   top_p = 0.9f;
+    float   temp  = 0.9f;
+
+    int32_t n_batch = 8; // batch size for prompt processing
+
+    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
+    std::string prompt;
+};
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//

 std::string trim(const std::string & s);

@ -13,6 +47,52 @@ std::string replace(
        const std::string & from,
        const std::string & to);

+struct gpt_vocab {
+    using id    = int32_t;
+    using token = std::string;
+
+    std::map<token, id> token_to_id;
+    std::map<id, token> id_to_token;
+};
+
+// poor-man's JSON parsing
+std::map<std::string, int32_t> json_parse(const std::string & fname);
+
+// split text into tokens
+//
+// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+//
+// Regex (Python):
+// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+//
+// Regex (C++):
+// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+//
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+
+// load the tokens from encoder.json
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+
+// sample next token given probabilities for each embedding
+//
+//   - consider only the top K tokens
+//   - from them, consider only the top tokens with cumulative probability > P
+//
+// TODO: not sure if this implementation is correct
+// TODO: temperature is not implemented
+//
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const gpt_vocab & vocab,
+        const float * logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        std::mt19937 & rng);
+
+//
+// Audio utils
+//
+
 // Read WAV audio file and store the PCM data into pcmf32
 // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
 // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM