Compare commits


1 Commit

SHA1: 4f074fb7a8 | Message: tmp : demonstrate how to measure time of ggml ops | Date: 2023-03-09 09:28:06 +02:00
39 changed files with 744 additions and 5909 deletions

.gitignore (vendored, 3 changed lines)
View File

@ -18,7 +18,6 @@ build-sanitize-thread/
/stream
/command
/talk
/talk-llama
/bench
arm_neon.h
@ -33,5 +32,3 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
extra/bench-gg.txt
*.mlmodel*

View File

@ -172,9 +172,7 @@ else()
if(NOT WHISPER_NO_FMA)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
if(NOT WHISPER_NO_F16C)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
endif()
endif()

View File

@ -34,12 +34,6 @@ CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
# ref: https://github.com/ggerganov/whisper.cpp/issues/37
ifneq ($(wildcard /usr/include/musl/*),)
CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
endif
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
@ -151,15 +145,12 @@ ifneq ($(filter aarch64%,$(UNAME_M)),)
CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# 32-bit Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
# Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
endif
ifneq ($(filter armv7%,$(UNAME_M)),)
# 32-bit ARM, for example on Armbian or possibly raspbian
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
# 64-bit ARM, use these (TODO: auto-detect 64-bit)
# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
# Raspberry Pi 4
CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif
ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4
@ -181,7 +172,7 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV))
$(info )
default: main bench
default: main
#
# Build library
@ -200,7 +191,7 @@ libwhisper.so: ggml.o whisper.o
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
clean:
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so
rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
#
# Examples
@ -215,9 +206,6 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
./main -h
bench: examples/bench/bench.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
@ -227,8 +215,8 @@ command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whi
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk-llama $(CC_SDL) $(LDFLAGS)
bench: examples/bench/bench.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
#
# Audio samples

View File

@ -313,7 +313,7 @@ whisper_print_timings: total time = 32733.52 ms
## Real-time audio input example
This is a naive example of performing real-time inference on audio from your microphone.
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continously.
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
```java
@ -466,7 +466,7 @@ The original models are converted to a custom binary format. This allows to pack
You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
or manually from here:
- https://huggingface.co/ggerganov/whisper.cpp
- https://huggingface.co/datasets/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
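For instance, grabbing one of the converted models with the download script mentioned above could look like this (the `base.en` model name is only an illustrative choice):

```bash
# download a converted ggml model into ./models (model name is an example)
./models/download-ggml-model.sh base.en
```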
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
@ -476,7 +476,6 @@ in [models](models).
- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
- [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
- React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
@ -486,7 +485,6 @@ in [models](models).
- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
## Examples
@ -500,7 +498,6 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
| [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
| [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
| [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
| [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
| [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
| [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |

View File

@ -17,9 +17,9 @@ import (
// CONSTANTS
const (
srcUrl = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
srcExt = ".bin" // Filename extension
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
srcUrl = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
srcExt = ".bin" // Filename extension
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
)
var (

View File

@ -63,5 +63,4 @@ else()
add_subdirectory(command)
add_subdirectory(bench)
add_subdirectory(talk)
add_subdirectory(talk-llama)
endif()

View File

@ -1,22 +1,15 @@
const path = require("path");
const { whisper } = require(path.join(
__dirname,
"../../../build/Release/whisper-addon"
));
const { promisify } = require("util");
const whisperAsync = promisify(whisper);
const path = require('path');
const { whisper } = require(path.join(__dirname, '../../../build/Release/whisper-addon'));
const whisperParamsMock = {
language: "en",
model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
language: 'en',
model: path.join(__dirname, '../../../models/ggml-base.en.bin'),
fname_inp: path.join(__dirname, '../../../samples/jfk.wav'),
};
describe("Run whisper.node", () => {
test("it should receive a non-empty value", async () => {
let result = await whisperAsync(whisperParamsMock);
expect(result.length).toBeGreaterThan(0);
});
test("it should receive a non-empty value", () => {
expect(whisper(whisperParamsMock).length).toBeGreaterThan(0);
});
});

View File

@ -160,6 +160,22 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
return 3;
}
// initial prompt
std::vector<whisper_token> prompt_tokens;
if (!params.prompt.empty()) {
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
fprintf(stderr, "\n");
fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
fprintf(stderr, "initial tokens: [ ");
for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
fprintf(stderr, "%d ", prompt_tokens[i]);
}
fprintf(stderr, "]\n");
}
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -227,7 +243,8 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
wparams.initial_prompt = params.prompt.c_str();
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
whisper_print_user_data user_data = { &params, &pcmf32s };
@ -275,64 +292,51 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
return 0;
}
class Worker : public Napi::AsyncWorker {
public:
Worker(Napi::Function& callback, whisper_params params)
: Napi::AsyncWorker(callback), params(params) {}
void Execute() override {
run(params, result);
}
void OnOK() override {
Napi::HandleScope scope(Env());
Napi::Object res = Napi::Array::New(Env(), result.size());
for (uint64_t i = 0; i < result.size(); ++i) {
Napi::Object tmp = Napi::Array::New(Env(), 3);
for (uint64_t j = 0; j < 3; ++j) {
tmp[j] = Napi::String::New(Env(), result[i][j]);
}
res[i] = tmp;
Napi::Object whisper(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (info.Length() <= 0 || !info[0].IsObject()) {
Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
}
Callback().Call({Env().Null(), res});
}
whisper_params params;
std::vector<std::vector<std::string>> result;
private:
whisper_params params;
std::vector<std::vector<std::string>> result;
};
Napi::Object whisper_params = info[0].As<Napi::Object>();
std::string language = whisper_params.Get("language").As<Napi::String>();
std::string model = whisper_params.Get("model").As<Napi::String>();
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
params.language = language;
params.model = model;
params.fname_inp.emplace_back(input);
// run model
run(params, result);
Napi::Value whisper(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (info.Length() <= 0 || !info[0].IsObject()) {
Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
}
whisper_params params;
fprintf(stderr, "RESULT:\n");
for (auto sentence:result) {
fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
}
Napi::Object whisper_params = info[0].As<Napi::Object>();
std::string language = whisper_params.Get("language").As<Napi::String>();
std::string model = whisper_params.Get("model").As<Napi::String>();
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
Napi::Object res = Napi::Array::New(env, result.size());
for (uint64_t i = 0; i < result.size(); ++i) {
Napi::Object tmp = Napi::Array::New(env, 3);
for (uint64_t j = 0; j < 3; ++j) {
tmp[j] = Napi::String::New(env, result[i][j]);
}
res[i] = tmp;
}
params.language = language;
params.model = model;
params.fname_inp.emplace_back(input);
Napi::Function callback = info[1].As<Napi::Function>();
Worker* worker = new Worker(callback, params);
worker->Queue();
return env.Undefined();
return res;
}
Napi::Object Init(Napi::Env env, Napi::Object exports) {
exports.Set(
Napi::String::New(env, "whisper"),
Napi::Function::New(env, whisper)
);
return exports;
exports.Set(
Napi::String::New(env, "whisper"),
Napi::Function::New(env, whisper)
);
return exports;
}
NODE_API_MODULE(whisper, Init);

View File

@ -1,36 +1,27 @@
const path = require("path");
const { whisper } = require(path.join(
__dirname,
"../../build/Release/whisper-addon"
));
const { promisify } = require("util");
const whisperAsync = promisify(whisper);
const path = require('path');
const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));
const whisperParams = {
language: "en",
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
fname_inp: "../../samples/jfk.wav",
language: 'en',
model: path.join(__dirname, '../../models/ggml-base.en.bin'),
fname_inp: '',
};
const arguments = process.argv.slice(2);
const params = Object.fromEntries(
arguments.reduce((pre, item) => {
if (item.startsWith("--")) {
return [...pre, item.slice(2).split("=")];
}
return pre;
}, [])
arguments.reduce((pre, item) => {
if (item.startsWith("--")) {
return [...pre, item.slice(2).split("=")];
}
return pre;
}, []),
);
for (const key in params) {
if (whisperParams.hasOwnProperty(key)) {
whisperParams[key] = params[key];
}
if (whisperParams.hasOwnProperty(key)) {
whisperParams[key] = params[key];
}
}
console.log("whisperParams =", whisperParams);
whisperAsync(whisperParams).then((result) => {
console.log(`Result from whisper: ${result}`);
});
console.log('whisperParams =', whisperParams);
console.log(whisper(whisperParams));

View File

@ -31,7 +31,6 @@ options:
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-oj, --output-json [false ] output result in a JSON file
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors
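As a rough sketch of how a few of these switches combine (the `-m`/`-f` model and input flags and the file paths below are assumed for illustration; they are not part of the excerpt above):

```bash
# transcribe a sample and write both an SRT and a CSV file under a chosen base name
./main -m models/ggml-base.en.bin -f samples/jfk.wav -osrt -ocsv -of transcript
```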

View File

@ -8,7 +8,6 @@
#include <string>
#include <thread>
#include <vector>
#include <cstring>
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
@ -74,7 +73,6 @@ struct whisper_params {
bool output_srt = false;
bool output_wts = false;
bool output_csv = false;
bool output_jsn = false;
bool print_special = false;
bool print_colors = false;
bool print_progress = false;
@ -132,7 +130,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
@ -181,7 +178,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
@ -372,164 +368,6 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
return true;
}
char *escape_double_quotes(const char *str) {
if (str == NULL) {
return NULL;
}
size_t escaped_length = strlen(str) + 1;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped_length++;
}
}
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) {
return NULL;
}
size_t pos = 0;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped[pos++] = '\\';
escaped[pos++] = '"';
} else {
escaped[pos++] = str[i];
}
}
// no need to set zero due to calloc() being used prior
return escaped;
}
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
std::ofstream fout(fname);
int indent = 0;
auto doindent = [&]() {
for (int i = 0; i < indent; i++) fout << "\t";
};
auto start_arr = [&](const char *name) {
doindent();
fout << "\"" << name << "\": [\n";
indent++;
};
auto end_arr = [&](bool end = false) {
indent--;
doindent();
fout << (end ? "]\n" : "},\n");
};
auto start_obj = [&](const char *name = nullptr) {
doindent();
if (name) {
fout << "\"" << name << "\": {\n";
} else {
fout << "{\n";
}
indent++;
};
auto end_obj = [&](bool end = false) {
indent--;
doindent();
fout << (end ? "}\n" : "},\n");
};
auto start_value = [&](const char *name) {
doindent();
fout << "\"" << name << "\": ";
};
auto value_s = [&](const char *name, const char *val, bool end = false) {
start_value(name);
char * val_escaped = escape_double_quotes(val);
fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
free(val_escaped);
};
auto end_value = [&](bool end = false) {
fout << (end ? "\n" : ",\n");
};
auto value_i = [&](const char *name, const int64_t val, bool end = false) {
start_value(name);
fout << val;
end_value(end);
};
auto value_b = [&](const char *name, const bool val, bool end = false) {
start_value(name);
fout << (val ? "true" : "false");
end_value(end);
};
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
return false;
}
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
start_obj();
value_s("systeminfo", whisper_print_system_info());
start_obj("model");
value_s("type", whisper_model_type_readable(ctx));
value_b("multilingual", whisper_is_multilingual(ctx));
value_i("vocab", whisper_model_n_vocab(ctx));
start_obj("audio");
value_i("ctx", whisper_model_n_audio_ctx(ctx));
value_i("state", whisper_model_n_audio_state(ctx));
value_i("head", whisper_model_n_audio_head(ctx));
value_i("layer", whisper_model_n_audio_layer(ctx), true);
end_obj();
start_obj("text");
value_i("ctx", whisper_model_n_text_ctx(ctx));
value_i("state", whisper_model_n_text_state(ctx));
value_i("head", whisper_model_n_text_head(ctx));
value_i("layer", whisper_model_n_text_layer(ctx), true);
end_obj();
value_i("mels", whisper_model_n_mels(ctx));
value_i("f16", whisper_model_f16(ctx), true);
end_obj();
start_obj("params");
value_s("model", params.model.c_str());
value_s("language", params.language.c_str());
value_b("translate", params.translate, true);
end_obj();
start_obj("result");
value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
end_obj();
start_arr("transcription");
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
start_obj();
start_obj("timestamps");
value_s("from", to_timestamp(t0, true).c_str());
value_s("to", to_timestamp(t1, true).c_str(), true);
end_obj();
start_obj("offsets");
value_i("from", t0 * 10);
value_i("to", t1 * 10, true);
end_obj();
value_s("text", text, true);
end_obj(i == (n_segments - 1));
}
end_arr(true);
end_obj(true);
return true;
}
// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments
@ -675,6 +513,22 @@ int main(int argc, char ** argv) {
return 3;
}
// initial prompt
std::vector<whisper_token> prompt_tokens;
if (!params.prompt.empty()) {
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
fprintf(stderr, "\n");
fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
fprintf(stderr, "initial tokens: [ ");
for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
fprintf(stderr, "%d ", prompt_tokens[i]);
}
fprintf(stderr, "]\n");
}
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
const auto fname_inp = params.fname_inp[f];
const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -738,7 +592,8 @@ int main(int argc, char ** argv) {
wparams.speed_up = params.speed_up;
wparams.initial_prompt = params.prompt.c_str();
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.greedy.best_of = params.best_of;
wparams.beam_search.beam_size = params.beam_size;
@ -807,12 +662,6 @@ int main(int argc, char ** argv) {
const auto fname_csv = fname_out + ".csv";
output_csv(ctx, fname_csv.c_str());
}
// output to JSON file
if (params.output_jsn) {
const auto fname_jsn = fname_out + ".json";
output_json(ctx, fname_jsn.c_str(), params);
}
}
}

View File

@ -1,2 +0,0 @@
eleven-labs.py
audio.mp3

View File

@ -1,16 +0,0 @@
if (WHISPER_SUPPORT_SDL2)
# talk-llama
set(TARGET talk-llama)
#add_executable(${TARGET} talk-llama.cpp llama.cpp)
#target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
#target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
# TODO: this is temporary
# need to export ggml symbols for MSVC, but too lazy ..
add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
include(DefaultTargetOptions)
endif ()

View File

@ -1,36 +0,0 @@
# talk-llama
Talk with an LLaMA AI in your terminal
[Demo Talk](https://user-images.githubusercontent.com/1991296/228024237-848f998c-c334-46a6-bef8-3271590da83b.mp4)
## Building
The `talk-llama` tool depends on SDL2 library to capture audio from the microphone. You can build it like this:
```bash
# Install SDL2 on Linux
sudo apt-get install libsdl2-dev
# Install SDL2 on Mac OS
brew install sdl2
# Build the "talk-llama" executable
make talk-llama
# Run it
./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
```
- The `-mw` argument specifies the Whisper model that you would like to use. Recommended `base` or `small` for real-time experience
- The `-ml` argument specifies the LLaMA model that you would like to use. Read the instructions in https://github.com/ggerganov/llama.cpp for information about how to obtain a `ggml` compatible LLaMA model
## TTS
For best experience, this example needs a TTS tool to convert the generated text responses to voice.
You can use any TTS engine that you would like - simply edit the [speak.sh](speak.sh) script to your needs.
By default, it is configured to use MacOS's `say`, but you can use whatever you wish.
## Discussion
If you have any feedback, please let "us" know in the following discussion: https://github.com/ggerganov/whisper.cpp/discussions/672?converting=1
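Since the README above suggests editing [speak.sh](speak.sh) to match whatever TTS engine is available, a minimal non-macOS variant might simply call espeak; the flags below mirror the commented-out espeak line already present in the bundled script:

```bash
#!/bin/bash
# speak.sh <voice_id> <text-to-speak>, minimal espeak-based sketch
# Linux: apt-get install espeak   macOS: brew install espeak
espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
```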

File diff suppressed because it is too large.

View File

@ -1,152 +0,0 @@
#ifndef LLAMA_H
#define LLAMA_H
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef LLAMA_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_context;
typedef int llama_token;
typedef struct llama_token_data {
llama_token id; // token id
float p; // probability of the token
float plog; // log probability of the token
} llama_token_data;
typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params {
int n_ctx; // text context
int n_parts; // -1 for default
int seed; // RNG seed, 0 for random
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
};
LLAMA_API struct llama_context_params llama_context_default_params();
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
LLAMA_API struct llama_context * llama_init_from_file(
const char * path_model,
struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// TODO: not great API - very likely to change
// Returns 0 on success
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos();
// TODO: improve the last_n_tokens interface ?
LLAMA_API llama_token llama_sample_top_p_top_k(
struct llama_context * ctx,
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float temp,
float repeat_penalty);
// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -1,23 +0,0 @@
Below is an instruction that describes a task. Write a response that appropriately completes the request.
### Instruction:
Write a text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.
### Response:
{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4}

View File

@ -1,28 +0,0 @@
#!/bin/bash
# Usage:
# speak.sh <voice_id> <text-to-speak>
# espeak
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
# for Mac
if [ "$1" = "0" ]; then
say "$2"
elif [ "$1" = "1" ]; then
say -v "Samantha (Enhanced)" "$2"
elif [ "$1" = "2" ]; then
say -v "Daniel (Enhanced)" "$2"
elif [ "$1" = "3" ]; then
say -v "Veena (Enhanced)" "$2"
fi
# Eleven Labs
#
#wd=$(dirname $0)
#script=$wd/eleven-labs.py
#python3 $script $1 "$2" >/dev/null 2>&1
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1

View File

@ -1,703 +0,0 @@
// Talk with AI
//
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "llama.h"
#include <map>
#include <cassert>
#include <cstdio>
#include <fstream>
#include <regex>
#include <string>
#include <thread>
#include <vector>
#include <regex>
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt numer of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int)add_bos);
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t voice_id = 0;
int32_t voice_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 64;
int32_t audio_ctx = 0;
int32_t n_parts_llama = -1;
float vad_thold = 0.4f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
bool verbose_prompt = false;
std::string name_ni = "Georgi"; // natural intelligence
std::string name_ai = "LLaMA"; // artificial intelligence
std::string language = "en";
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin";
std::string speak = "./examples/talk/speak.sh";
std::string prompt = "";
std::string fname_out;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vid" || arg == "--voice-id") { params.voice_id = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "--n-parts-llama") { params.n_parts_llama = std::stoi(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "--verbose-prompt") { params.verbose_prompt = true; }
else if (arg == "-nni" || arg == "--name-ni") { params.name_ni = argv[++i]; }
else if (arg == "-nai" || arg == "--name-ai") { params.name_ai = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "--prompt-file") {
std::ifstream file(argv[++i]);
std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt));
if (params.prompt.back() == '\n') {
params.prompt.pop_back();
}
}
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vid N, --voice-id N [%-7d] voice ID\n", params.voice_id);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -nni NAME,--name-ni NAME [%-7s] natural intelligence name\n", params.name_ni.c_str());
fprintf(stderr, " -nai NAME,--name-ai NAME [%-7s] artificial intelligence name\n", params.name_ai.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -ml FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " --n-parts-llama N [%-7d] num parts in llama model file\n", params.n_parts_llama);
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " --prompt-file FNAME [%-7s] file with custom prompt to start dialog\n", "");
fprintf(stderr, " --verbose-prompt [%-7s] print prompt at start\n", params.verbose_prompt ? "true" : "false");
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
std::string transcribe(
whisper_context * ctx,
const whisper_params & params,
const std::vector<float> & pcmf32,
const std::string prompt_text,
float & prob,
int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
std::vector<whisper_token> prompt_tokens;
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size()));
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = 2;
wparams.prompt_tokens = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.empty() ? 0 : prompt_tokens.size();
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
static int iter = params.voice_id;
std::this_thread::sleep_for(std::chrono::milliseconds(100*iter));
iter = (iter + 1) % 4;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
const std::vector<std::string> k_participants = {
"LLaMA",
"GGaMA",
"SSaMA",
"RRaMA",
};
// homophones
const std::map<std::string, std::vector<std::string>> k_homophones = {
{ "LLaMA", { "llama", "Llama", "LLAMA", }, },
{ "GGaMA", { "gama", "Gama", "GAMA", "gamma", "Gamma", "GAMMA", }, },
{ "SSaMA", { "sama", "Sama", "SAMA", "samma", "Samma", "SAMMA", }, },
{ "RRaMA", { "rama", "Rama", "RAMA", "ramma", "Ramma", "RAMMA", }, },
};
const std::string k_prompt_whisper = R"(A conversation between {1}, {10}, {11}, {12} and {13}.)";
const std::map<std::string, std::string> k_prompt = {
{
k_participants.at(0),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{10} is smart, objective, honest and kind. Never fails to give a meaningful and insightful answer and opinion.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {10}! What is your opinion on the current state of the world?
{10}{4} Great question {1}! I think we live in a very interesting time.
There are many things to be concerned about, but also many things to be optimistic about.
{1}{4} What advice would you give to a young person who is just starting out in life?
{10}{4} I would tell them to be patient and to not be afraid to fail.
It is important to learn from your mistakes and to keep trying.
{1}{4})"
},
{
k_participants.at(1),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{11} has critical thinking skills, is very knowledgeable and is a good listener. He is very humble and never arrogant.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {11}! What is your opinion on the current state of the world?
{11}{4} The world is about to experience a major change. We are on the verge of a new era.
{1}{4} What advice would you give to a young person who is just starting out in life?
{11}{4} My advice would be to be open minded and to be willing to learn from others.
{1}{4})"
},
{
k_participants.at(2),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{12} has strong leadership skills, strategic thinking, and innovative ideas. Has the ability to mentor and support young people.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {12}! What is your opinion on the current state of the world?
{12}{4} Our future is bright. We are living in a time of great opportunity.
{1}{4} What advice would you give to a young person who is just starting out in life?
{12}{4} I would tell them to be brave and to be willing to take risks.
{1}{4})"
},
{
k_participants.at(3),
R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{10}, {11}, {12} and {13} respond with short and concise answers.
{13} is rude, arrogant, and has a bad attitude. He is very opinionated and never listens to others.
{1} is leading the conversation and asking the questions.
{1}{4} Hello {13}! What is your opinion on the current state of the world?
{13}{4} The world is a terrible place. It is full of evil and corruption.
{1}{4} What advice would you give to a young person who is just starting out in life?
{13}{4} I would tell them to be selfish and to never trust anyone.
{1}{4})"
},
};
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
// llama init
auto lparams = llama_context_default_params();
// tune these to your liking
lparams.n_ctx = 512;
lparams.seed = 1;
lparams.f16_kv = true;
lparams.n_parts = params.n_parts_llama;
struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx_wsp)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
int n_iter = 0;
bool is_running = true;
bool force_speak = false;
float prob0 = 0.0f;
const std::string chat_symb = ":";
const std::string name_ni = params.name_ni;
const std::string name_ai = params.name_ai;
// the participant that was referenced last
std::string name_ref = name_ni;
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
std::string prompt_whisper = k_prompt_whisper;
prompt_whisper = ::replace(prompt_whisper, "{1}", name_ni);
prompt_whisper = ::replace(prompt_whisper, "{10}", k_participants.at(0));
prompt_whisper = ::replace(prompt_whisper, "{11}", k_participants.at(1));
prompt_whisper = ::replace(prompt_whisper, "{12}", k_participants.at(2));
prompt_whisper = ::replace(prompt_whisper, "{13}", k_participants.at(3));
// construct the initial prompt for LLaMA inference
std::string prompt_llama = params.prompt.empty() ? k_prompt.find(name_ai)->second : params.prompt;
// need to have leading ' '
prompt_llama.insert(0, 1, ' ');
prompt_llama = ::replace(prompt_llama, "{1}", name_ni);
prompt_llama = ::replace(prompt_llama, "{10}", k_participants.at(0));
prompt_llama = ::replace(prompt_llama, "{11}", k_participants.at(1));
prompt_llama = ::replace(prompt_llama, "{12}", k_participants.at(2));
prompt_llama = ::replace(prompt_llama, "{13}", k_participants.at(3));
{
// get date string
std::string date_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%d/%m/%Y", now);
date_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{1}", date_str);
}
{
// get time string
std::string time_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%H:%M", now);
time_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{2}", time_str);
}
{
// get year string
std::string year_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%Y", now);
year_str = buf;
}
prompt_llama = ::replace(prompt_llama, "{3}", year_str);
}
prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
// evaluate the initial prompt
auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);
printf("\n");
printf("%s : initializing - please wait ...\n", __func__);
if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
if (params.verbose_prompt) {
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_whisper.c_str());
fprintf(stdout, "\n");
fprintf(stdout, "\n");
fprintf(stdout, "%s", prompt_llama.c_str());
fprintf(stdout, "\n");
fprintf(stdout, "\n");
fflush(stdout);
}
printf("%s : done! start speaking in the microphone\n", __func__);
printf("\n");
printf("%s%s", name_ni.c_str(), chat_symb.c_str());
fflush(stdout);
// clear audio buffer
audio.clear();
// text inference variables
const int voice_id = params.voice_id;
const int n_keep = embd_inp.size();
const int n_ctx = llama_n_ctx(ctx_llama);
int n_past = n_keep;
int n_prev = 64; // TODO arg
std::vector<llama_token> embd;
// reverse prompts for detecting when it's time to stop speaking
std::vector<std::string> antiprompts = {
name_ni + chat_symb,
};
for (const auto & p : k_participants) {
antiprompts.push_back(p + chat_symb);
}
std::string text_heard_all;
// main loop
while (is_running) {
// handle Ctrl + C
is_running = sdl_poll_events();
if (!is_running) {
break;
}
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
int64_t t_ms = 0;
{
audio.get(15000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
//fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
audio.get(params.voice_ms, pcmf32_cur);
std::string text_heard;
if (!force_speak) {
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
}
// remove text between brackets using regex
{
std::regex re("\\[.*?\\]");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove text between brackets using regex
{
std::regex re("\\(.*?\\)");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
// take first line
text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
// remove leading and trailing whitespace
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
if (text_heard.empty() || tokens.empty() || force_speak) {
//fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
audio.clear();
continue;
}
force_speak = false;
if (text_heard[0] != ' ') {
text_heard.insert(0, 1, ' ');
}
// replace homophones
for (const auto & homophone : k_homophones) {
for (const auto & word : homophone.second) {
text_heard = ::replace(text_heard, word, homophone.first);
}
}
// check which participant was mentioned
const auto name_ref_old = name_ref;
for (const auto & participant : k_participants) {
if (participant == name_ref) {
continue;
}
if (text_heard.find(participant) != std::string::npos) {
name_ref = participant;
break;
}
}
if (name_ref == name_ref_old && name_ref != name_ai) {
name_ref = name_ni;
}
text_heard += "\n" + name_ref + chat_symb;
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
fflush(stdout);
text_heard_all += text_heard;
// keep only last 100 characters
if (text_heard_all.size() > 100) {
text_heard_all = text_heard_all.substr(text_heard_all.size() - 100);
}
if (name_ref != name_ai) {
} else {
// text inference
bool done = false;
std::string text_to_speak;
embd = ::llama_tokenize(ctx_llama, text_heard_all, false);
text_heard_all.clear();
while (true) {
// predict
if (embd.size() > 0) {
if (n_past + (int) embd.size() > n_ctx) {
n_past = n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx_llama, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
//printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
n_past += embd.size();
embd.clear();
if (done) break;
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.20f;
const float repeat_penalty = 1.0764f;
const int repeat_last_n = 256;
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty);
}
if (id != llama_token_eos()) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_str(ctx_llama, id);
printf("%s", llama_token_to_str(ctx_llama, id));
}
// new line
if (id == 13) {
}
}
{
std::string last_output;
for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
}
last_output += llama_token_to_str(ctx_llama, embd[0]);
for (const std::string & antiprompt : antiprompts) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = ::replace(text_to_speak, antiprompt, "");
fflush(stdout);
break;
}
}
}
is_running = sdl_poll_events();
if (!is_running) {
break;
}
}
text_to_speak = ::replace(text_to_speak, "\"", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
}
audio.clear();
++n_iter;
}
}
}
audio.pause();
whisper_print_timings(ctx_wsp);
whisper_free(ctx_wsp);
llama_print_timings(ctx_llama);
llama_free(ctx_llama);
return 0;
}

View File

@ -325,12 +325,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = NULL;
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -531,11 +528,9 @@ bool gpt2_eval(
}
}
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_context * ctx0 = ggml_init(params);

View File

@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:
```
wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
```
## TTS

View File

@ -325,11 +325,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context
{
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ nullptr,
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = ctx_size;
params.mem_buffer = nullptr;
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -530,11 +528,9 @@ bool gpt2_eval(
}
}
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = buf_size;
params.mem_buffer = buf;
struct ggml_context * ctx0 = ggml_init(params);

View File

@ -7,10 +7,7 @@
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
# Mac OS "say" command
say "$2"
espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
# Eleven Labs
#

View File

@ -24,5 +24,3 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
This can significantly improve the performance of the transcription:
<img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.

View File

@ -296,10 +296,6 @@
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES;
OTHER_CFLAGS = (
"-O3",
"-DNDEBUG",
);
SDKROOT = iphoneos;
VALIDATE_PRODUCT = YES;
};

View File

@ -1,18 +1,14 @@
A sample SwiftUI app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions.
See also: [whisper.objc](https://github.com/ggerganov/whisper.cpp/tree/master/examples/whisper.objc).
**Usage**:
To use:
1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1]
2. Add the model to `whisper.swiftui.demo/Resources/models` **via Xcode**.
2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
4. Add the sample audio file to `whisper.swiftui.demo/Resources/samples` **via Xcode**.
5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
**Note:** Pay attention to the folder path: `whisper.swiftui.demo/Resources/models` is the appropriate directory to place resources whilst `whisper.swiftui.demo/Models` is related to actual code.
4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
5. Select the "release" build configuration under "Run", then deploy and run to your device.
[^1]: I recommend the tiny, base or small models for running on an iOS device.
[^2]: The `Release` build can boost performance of transcription. In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)

View File

@ -430,10 +430,6 @@
LLVM_LTO = YES;
MACOSX_DEPLOYMENT_TARGET = 13.0;
MARKETING_VERSION = 1.0;
OTHER_CFLAGS = (
"-O3",
"-DNDEBUG",
);
PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;

3096
ggml.c

File diff suppressed because it is too large

32
ggml.h
View File

@ -198,8 +198,6 @@ struct ggml_object;
struct ggml_context;
enum ggml_type {
GGML_TYPE_Q4_0,
GGML_TYPE_Q4_1,
GGML_TYPE_I8,
GGML_TYPE_I16,
GGML_TYPE_I32,
@ -228,9 +226,7 @@ enum ggml_op {
GGML_OP_STEP,
GGML_OP_RELU,
GGML_OP_GELU,
GGML_OP_SILU,
GGML_OP_NORM, // normalize
GGML_OP_RMS_NORM,
GGML_OP_MUL_MAT,
@ -316,7 +312,6 @@ struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
void ggml_time_init(void); // call this once at the beginning of the program
@ -331,10 +326,7 @@ void ggml_print_objects(const struct ggml_context * ctx);
int ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_nbytes (const struct ggml_tensor * tensor);
int ggml_blck_size (enum ggml_type type);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
size_t ggml_type_size (enum ggml_type type);
size_t ggml_element_size(const struct ggml_tensor * tensor);
struct ggml_context * ggml_init(struct ggml_init_params params);
@ -344,13 +336,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
bool ggml_mlock_supported(void);
bool ggml_mlock(
struct ggml_context * ctx,
const void *opt_extra_addr,
size_t opt_extra_len,
char **err_p);
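As a usage sketch for the `ggml_mlock()` declaration above: the call below assumes the function returns `true` on success and reports a human-readable message through `err_p` on failure; ownership of that error string is not specified in this excerpt, so it is only printed here.

```
#include "ggml.h"
#include <cstdio>

void try_mlock(struct ggml_context * ctx) {
    if (!ggml_mlock_supported()) {
        fprintf(stderr, "mlock is not supported on this platform\n");
        return;
    }

    char * err = nullptr;
    // No extra address range is pinned here (opt_extra_addr = NULL, opt_extra_len = 0).
    if (!ggml_mlock(ctx, nullptr, 0, &err)) {
        fprintf(stderr, "failed to lock memory: %s\n", err ? err : "(unknown error)");
    }
}
```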
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
@ -481,20 +466,12 @@ struct ggml_tensor * ggml_gelu(
struct ggml_context * ctx,
struct ggml_tensor * a);
struct ggml_tensor * ggml_silu(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows
// TODO: eps is hardcoded to 1e-5 for now
struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
struct ggml_tensor * a);
struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx,
struct ggml_tensor * a);
// A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows
@ -749,13 +726,6 @@ enum ggml_opt_result ggml_opt(
struct ggml_opt_params params,
struct ggml_tensor * f);
//
// quantization
//
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
//
// system info
//
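For the `ggml_quantize_q4_0()` / `ggml_quantize_q4_1()` declarations a few lines up, here is a hedged sketch of how they might be called; it assumes `n` is the total number of input floats, `k` the row length, and a 16-bin histogram, none of which this header excerpt spells out.

```
#include "ggml.h"
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    const int k = 64;              // row length (assumed to be a multiple of the block size)
    const int n = 4*k;             // total number of input values
    std::vector<float>   src(n, 0.5f);
    std::vector<uint8_t> dst(n);   // generous output buffer; the actual size is returned
    int64_t hist[16] = {0};        // assumed histogram of quantized nibble values

    const size_t bytes = ggml_quantize_q4_0(src.data(), dst.data(), n, k, hist);
    printf("quantized %d floats into %zu bytes\n", n, bytes);
    return 0;
}
```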

View File

@ -6,7 +6,7 @@ using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either
the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
script to download the already converted models. Currently, they are hosted on the following locations:
- https://huggingface.co/ggerganov/whisper.cpp
- https://huggingface.co/datasets/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
Sample usage:
@ -23,7 +23,7 @@ You can now use it like this:
A third option to obtain the model files is to download them from Hugging Face:
https://huggingface.co/ggerganov/whisper.cpp/tree/main
https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
## Available models

View File

@ -79,11 +79,11 @@ dir_model = sys.argv[1]
dir_whisper = sys.argv[2]
dir_out = sys.argv[3]
with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
with open(dir_model + "/vocab.json", "r") as f:
encoder = json.load(f)
with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
with open(dir_model + "/added_tokens.json", "r") as f:
encoder_added = json.load(f)
with open(dir_model + "/config.json", "r", encoding="utf8") as f:
with open(dir_model + "/config.json", "r") as f:
hparams = json.load(f)
model = WhisperForConditionalGeneration.from_pretrained(dir_model)

View File

@ -40,7 +40,7 @@ if exist "ggml-%model%.bin" (
goto :eof
)
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
if %ERRORLEVEL% neq 0 (
echo Failed to download ggml model %model%

View File

@ -6,13 +6,13 @@
#src="https://ggml.ggerganov.com"
#pfx="ggml-model-whisper"
src="https://huggingface.co/ggerganov/whisper.cpp"
src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
pfx="resolve/main/ggml"
# get the path of this script
function get_script_path() {
if [ -x "$(command -v realpath)" ]; then
echo "$(dirname "$(realpath "$0")")"
echo "$(dirname $(realpath $0))"
else
local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
echo "$ret"

View File

@ -1,6 +0,0 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "GGaMA" \
-t 8 -vid 1 --speak ./examples/talk-llama/speak.sh

View File

@ -1,6 +0,0 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "LLaMA" \
-t 8 -vid 0 --speak ./examples/talk-llama/speak.sh

View File

@ -1,6 +0,0 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "RRaMA" \
-t 8 -vid 3 --speak ./examples/talk-llama/speak.sh

View File

@ -1,6 +0,0 @@
./talk-llama \
-mw ./models/ggml-small.en.bin \
-ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
--name-ni "Georgi" \
--name-ai "SSaMA" \
-t 8 -vid 2 --speak ./examples/talk-llama/speak.sh

View File

@ -218,14 +218,14 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
{ "su", { 98, "sundanese", } },
};
static const size_t MB = 1ull*1024*1024;
static const size_t MB = 1024*1024;
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
{ MODEL_TINY, 14ull*MB },
{ MODEL_BASE, 18ull*MB },
{ MODEL_SMALL, 28ull*MB },
{ MODEL_MEDIUM, 36ull*MB },
{ MODEL_LARGE, 44ull*MB },
{ MODEL_TINY, 12ull*MB },
{ MODEL_BASE, 15ull*MB },
{ MODEL_SMALL, 23ull*MB },
{ MODEL_MEDIUM, 31ull*MB },
{ MODEL_LARGE, 38ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@ -631,13 +631,12 @@ struct whisper_context {
int64_t t_load_us = 0;
int64_t t_start_us = 0;
ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
whisper_model model;
whisper_vocab vocab;
whisper_state * state = nullptr;
std::string path_model; // populated by whisper_init_from_file()
};
template<typename T>
@ -654,11 +653,9 @@ static bool kv_cache_init(
int n_ctx) {
cache.buf.resize(mem_bytes);
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
cache.ctx = ggml_init(params);
@ -690,11 +687,9 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
/*.mem_buffer =*/ cache.buf.data(),
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = cache.buf.size();
params.mem_buffer = cache.buf.data();
cache.ctx = ggml_init(params);
@ -1032,11 +1027,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// create the ggml context
{
struct ggml_init_params params = {
/*.mem_size =*/ wctx.model.buf->size(),
/*.mem_buffer =*/ wctx.model.buf->data(),
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = wctx.model.buf->size();
params.mem_buffer = wctx.model.buf->data();
model.ctx = ggml_init(params);
if (!model.ctx) {
@ -1350,11 +1343,9 @@ static bool whisper_encode_internal(
const int n_mels = hparams.n_mels;
assert(mel_inp.n_mel == n_mels);
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = wstate.buf_compute.size();
params.mem_buffer = wstate.buf_compute.data();
struct ggml_context * ctx0 = ggml_init(params);
@ -1418,7 +1409,7 @@ static bool whisper_encode_internal(
//}
static int iter = 0;
const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
@ -1607,7 +1598,7 @@ static bool whisper_encode_internal(
ggml_repeat(ctx0, layer.mlp_ln_w, cur),
cur),
ggml_repeat(ctx0, layer.mlp_ln_b, cur));
}
}
#ifdef WHISPER_USE_FLASH_FF
wstate.use_buf(ctx0, 0);
@ -1647,7 +1638,7 @@ static bool whisper_encode_internal(
ggml_repeat(ctx0, layer.mlp_1_b, cur),
cur);
#endif
}
}
wstate.use_buf(ctx0, 3);
@ -1751,10 +1742,10 @@ static bool whisper_encode_internal(
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
// ggml_used_mem(ctx0)/1024.0/1024.0,
// wstate.get_buf_max_mem(0)/1024.0/1024.0,
// wstate.get_buf_max_mem(1)/1024.0/1024.0,
// wstate.get_buf_max_mem(2)/1024.0/1024.0,
// wstate.get_buf_max_mem(3)/1024.0/1024.0);
// wctx.get_buf_max_mem(0)/1024.0/1024.0,
// wctx.get_buf_max_mem(1)/1024.0/1024.0,
// wctx.get_buf_max_mem(2)/1024.0/1024.0,
// wctx.get_buf_max_mem(3)/1024.0/1024.0);
ggml_free(ctx0);
@ -1805,11 +1796,9 @@ static bool whisper_decode_internal(
//WHISPER_PRINT_DEBUG("%s: n_past = %d, N = %d, M = %d, n_ctx = %d\n", __func__, n_past, N, M, n_ctx);
struct ggml_init_params params = {
/*.mem_size =*/ wstate.buf_compute.size(),
/*.mem_buffer =*/ wstate.buf_compute.data(),
/*.no_alloc =*/ false,
};
struct ggml_init_params params;
params.mem_size = wstate.buf_compute.size();
params.mem_buffer = wstate.buf_compute.data();
struct ggml_context * ctx0 = ggml_init(params);
@ -1853,6 +1842,8 @@ static bool whisper_decode_internal(
// self-attention
{
wstate.use_buf(ctx0, 1);
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
layer.attn_q_w,
cur);
@ -1914,6 +1905,8 @@ static bool whisper_decode_internal(
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
wstate.use_buf(ctx0, 0);
//struct ggml_tensor * KQ_scaled =
// ggml_scale(ctx0,
// KQ,
@ -1922,16 +1915,20 @@ static bool whisper_decode_internal(
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
wstate.use_buf(ctx0, 1);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
wstate.use_buf(ctx0, 0);
struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
n_state/n_head, n_head, n_past + N),
1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_state/n_head, n_head));
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
n_state/n_head, n_head, n_past + N),
1, 2, 0, 3);
wstate.use_buf(ctx0, 1);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
@ -1968,6 +1965,8 @@ static bool whisper_decode_internal(
cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
wstate.use_buf(ctx0, 1);
// cur = ln_0_w*cur + ln_0_b
cur = ggml_add(ctx0,
ggml_mul(ctx0,
@ -1978,6 +1977,8 @@ static bool whisper_decode_internal(
// cross-attention
{
wstate.use_buf(ctx0, 0);
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
layer.cross_attn_q_w,
cur);
@ -2001,13 +2002,12 @@ static bool whisper_decode_internal(
ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
n_state/n_head, n_head, M);
struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
struct ggml_tensor * V_trans = ggml_permute(ctx0, Vcross, 1, 2, 0, 3);
// ------
wstate.use_buf(ctx0, 1);
struct ggml_tensor * Q =
ggml_permute(ctx0,
ggml_cpy(ctx0,
@ -2017,6 +2017,8 @@ static bool whisper_decode_internal(
struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
wstate.use_buf(ctx0, 0);
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
@ -2029,10 +2031,16 @@ static bool whisper_decode_internal(
// no masking for cross-attention
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
wstate.use_buf(ctx0, 1);
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
wstate.use_buf(ctx0, 0);
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
wstate.use_buf(ctx0, 1);
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_state, N)
@ -2152,6 +2160,12 @@ static bool whisper_decode_internal(
ggml_graph_compute (ctx0, &gf);
}
// print the time for computing the last ggml_mul_mat that computes logits
// also print the total decoder time
// these need to be called after ggml_graph_compute()
printf("logits t = %7.3f ms (%2d runs, N = %3d, ggml_mul_mat: [%d x %d] * [%d x %d])\n", 1e-3*double(logits->perf_time_us)/logits->perf_runs, logits->perf_runs, N, logits->ne[0], logits->ne[1], cur->ne[1], cur->ne[0]);
printf("total t = %7.3f ms (%2d runs)\n", 1e-3*double(gf.perf_time_us)/gf.perf_runs, gf.perf_runs);
// extract logits for all N tokens
//logits_out.resize(N*n_vocab);
//memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
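The printf added above is the point of this temporary commit: after `ggml_graph_compute()`, each tensor and the graph itself carry `perf_runs` / `perf_time_us` counters. Below is a standalone sketch of the same measurement outside the decoder; the tensor shapes and thread count are placeholders, and depending on how ggml is built the counters may need `-DGGML_PERF` to report non-zero times.

```
#include "ggml.h"
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    ggml_time_init(); // call once at the beginning of the program (see ggml.h)

    std::vector<uint8_t> buf(64u*1024*1024);
    struct ggml_init_params params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 512);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 512, 512);
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(c);
    gf.n_threads = 4;
    ggml_graph_compute(ctx, &gf);

    // per-op and whole-graph timings, averaged over the number of runs
    if (c->perf_runs > 0 && gf.perf_runs > 0) {
        printf("mul_mat: %7.3f ms (%d runs)\n", 1e-3*double(c->perf_time_us)/c->perf_runs, c->perf_runs);
        printf("graph  : %7.3f ms (%d runs)\n", 1e-3*double(gf.perf_time_us)/gf.perf_runs, gf.perf_runs);
    }

    ggml_free(ctx);
    return 0;
}
```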
@ -2163,10 +2177,10 @@ static bool whisper_decode_internal(
if (N > 1) {
//printf("%s: used_mem = %f MB, %f MB, %f MB %f MB %f MB\n", __func__,
// ggml_used_mem(ctx0)/1024.0/1024.0,
// wstate.get_buf_max_mem(0)/1024.0/1024.0,
// wstate.get_buf_max_mem(1)/1024.0/1024.0,
// wstate.get_buf_max_mem(2)/1024.0/1024.0,
// wstate.get_buf_max_mem(3)/1024.0/1024.0);
// wctx.get_buf_max_mem(0)/1024.0/1024.0,
// wctx.get_buf_max_mem(1)/1024.0/1024.0,
// wctx.get_buf_max_mem(2)/1024.0/1024.0,
// wctx.get_buf_max_mem(3)/1024.0/1024.0);
}
ggml_free(ctx0);
@ -2475,6 +2489,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
return nullptr;
@ -2495,6 +2510,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
state->logits_id.reserve(ctx->model.hparams.n_vocab);
@ -2545,13 +2561,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
fin->close();
};
auto ctx = whisper_init_no_state(&loader);
if (ctx) {
ctx->path_model = path_model;
}
return ctx;
return whisper_init_no_state(&loader);
}
struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
@ -2852,7 +2862,7 @@ int whisper_lang_auto_detect_with_state(
}
// run the encoder
if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
if (whisper_encode(ctx, seek, n_threads) != 0) {
fprintf(stderr, "%s: failed to encode\n", __func__);
return -6;
}
@ -2916,71 +2926,6 @@ int whisper_lang_auto_detect(
return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
}
int whisper_model_n_vocab(struct whisper_context * ctx) {
return ctx->model.hparams.n_vocab;
}
int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
return ctx->model.hparams.n_audio_ctx;
}
int whisper_model_n_audio_state(struct whisper_context * ctx) {
return ctx->model.hparams.n_audio_state;
}
int whisper_model_n_audio_head(struct whisper_context * ctx) {
return ctx->model.hparams.n_audio_head;
}
int whisper_model_n_audio_layer(struct whisper_context * ctx) {
return ctx->model.hparams.n_audio_layer;
}
int whisper_model_n_text_ctx(struct whisper_context * ctx) {
return ctx->model.hparams.n_text_ctx;
}
int whisper_model_n_text_state(struct whisper_context * ctx) {
return ctx->model.hparams.n_text_state;
}
int whisper_model_n_text_head(struct whisper_context * ctx) {
return ctx->model.hparams.n_text_head;
}
int whisper_model_n_text_layer(struct whisper_context * ctx) {
return ctx->model.hparams.n_text_layer;
}
int whisper_model_n_mels(struct whisper_context * ctx) {
return ctx->model.hparams.n_mels;
}
int whisper_model_f16(struct whisper_context * ctx) {
return ctx->model.hparams.f16;
}
int whisper_model_type(struct whisper_context * ctx) {
return ctx->model.type;
}
const char *whisper_model_type_readable(struct whisper_context * ctx) {
switch (ctx->model.type) {
case e_model::MODEL_TINY:
return "tiny";
case e_model::MODEL_BASE:
return "base";
case e_model::MODEL_SMALL:
return "small";
case e_model::MODEL_MEDIUM:
return "medium";
case e_model::MODEL_LARGE:
return "large";
default:
return "unknown";
}
}
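The `whisper_model_*` getters above (also declared in the whisper.h excerpt further down) expose the loaded model's hyperparameters. A hedged usage sketch follows; the model path is a placeholder.

```
#include "whisper.h"
#include <cstdio>

int main() {
    struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en.bin");
    if (!ctx) {
        return 1;
    }

    printf("type        : %s\n", whisper_model_type_readable(ctx));
    printf("n_vocab     : %d\n", whisper_model_n_vocab(ctx));
    printf("n_audio_ctx : %d\n", whisper_model_n_audio_ctx(ctx));
    printf("n_text_ctx  : %d\n", whisper_model_n_text_ctx(ctx));
    printf("n_mels      : %d\n", whisper_model_n_mels(ctx));
    printf("f16         : %d\n", whisper_model_f16(ctx));

    whisper_free(ctx);
    return 0;
}
```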
int whisper_n_len_from_state(struct whisper_state * state) {
return state->mel.n_len;
}
@ -3131,7 +3076,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.speed_up =*/ false,
/*.audio_ctx =*/ 0,
/*.initial_prompt =*/ nullptr,
/*.prompt_tokens =*/ nullptr,
/*.prompt_n_tokens =*/ 0,
@ -3144,7 +3088,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.max_initial_ts =*/ 1.0f,
/*.length_penalty =*/ -1.0f,
/*.temperature_inc =*/ 0.0f, // TODO: temporarily disabled until performance improves
/*.temperature_inc =*/ 0.2f,
/*.entropy_thold =*/ 2.4f,
/*.logprob_thold =*/ -1.0f,
/*.no_speech_thold =*/ 0.6f,
@ -3162,9 +3106,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.new_segment_callback =*/ nullptr,
/*.new_segment_callback_user_data =*/ nullptr,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.encoder_begin_callback =*/ nullptr,
/*.encoder_begin_callback_user_data =*/ nullptr,
@ -3807,15 +3748,6 @@ int whisper_full_with_state(
prompt_past.clear();
}
// initial prompt
if (!params.prompt_tokens && params.initial_prompt) {
std::vector<whisper_token> prompt_tokens;
prompt_tokens.resize(1024);
prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()));
params.prompt_tokens = prompt_tokens.data();
params.prompt_n_tokens = prompt_tokens.size();
}
// prepend the prompt tokens to the prompt_past
if (params.prompt_tokens && params.prompt_n_tokens > 0) {
// parse tokens from the pointer
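For context, here is a hedged sketch of how a caller supplies the prompt that the hunk above consumes, either as plain text (tokenized via the `initial_prompt` handling shown) or as pre-tokenized ids; the prompt text and the `run_with_prompt` function name are placeholders.

```
#include "whisper.h"
#include <vector>

void run_with_prompt(struct whisper_context * ctx, const float * pcm, int n_samples) {
    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    // Option 1: plain text; whisper_full() tokenizes it as in the hunk above.
    params.initial_prompt = "Glossary: ggml, whisper.cpp, GGML_TYPE_F16.";

    // Option 2: pre-tokenized ids; if set, these are used instead of initial_prompt.
    // std::vector<whisper_token> tokens(32);
    // const int n = whisper_tokenize(ctx, "glossary text", tokens.data(), (int) tokens.size());
    // params.prompt_tokens   = tokens.data();
    // params.prompt_n_tokens = n;

    whisper_full(ctx, params, pcm, n_samples);
}
```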
@ -3881,10 +3813,6 @@ int whisper_full_with_state(
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev);
}
}
if (params.progress_callback) {
params.progress_callback(
ctx, ctx->state, progress_prev, params.progress_callback_user_data);
}
// if only 1 second is left, then stop
if (seek + 100 >= seek_end) {
@ -4473,9 +4401,6 @@ int whisper_full_parallel(
params_cur.new_segment_callback = nullptr;
params_cur.new_segment_callback_user_data = nullptr;
params_cur.progress_callback = nullptr;
params_cur.progress_callback_user_data = nullptr;
workers[i] = std::thread(whisper_full_with_state, ctx, states[i], std::move(params_cur), samples + start_samples, n_samples_cur);
}
@ -4736,7 +4661,6 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(gparams);

View File

@ -248,19 +248,6 @@ extern "C" {
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
// Token logits obtained from the last call to whisper_decode()
// The logits for the last token are stored in the last row
// Rows: n_tokens
@ -270,8 +257,6 @@ extern "C" {
// Token Id -> String. Uses the vocabulary in the provided context
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
// Special tokens
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
@ -306,9 +291,6 @@ extern "C" {
// Use the whisper_full_...() functions to obtain the text segments
typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
// Progress callback
typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);
// Encoder begin callback
// If not NULL, called before the encoder starts
// If it returns false, the computation is aborted
@ -359,7 +341,6 @@ extern "C" {
// tokens to provide to the whisper decoder as initial prompt
// these are prepended to any existing text context from a previous call
const char * initial_prompt;
const whisper_token * prompt_tokens;
int prompt_n_tokens;
@ -395,10 +376,6 @@ extern "C" {
whisper_new_segment_callback new_segment_callback;
void * new_segment_callback_user_data;
// called on each progress update
whisper_progress_callback progress_callback;
void * progress_callback_user_data;
// called each time before the encoder starts
whisper_encoder_begin_callback encoder_begin_callback;
void * encoder_begin_callback_user_data;
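To close the loop on the progress-callback fields shown in this hunk, here is a hedged sketch of registering one; the `on_progress` and `transcribe` function names are placeholders.

```
#include "whisper.h"
#include <cstdio>

static void on_progress(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/,
                        int progress, void * /*user_data*/) {
    // progress is reported as a percentage
    printf("progress: %d%%\n", progress);
}

void transcribe(struct whisper_context * ctx, const float * pcm, int n_samples) {
    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    params.progress_callback           = on_progress;
    params.progress_callback_user_data = nullptr;

    whisper_full(ctx, params, pcm, n_samples);
}
```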