llama podcast

whisper : add progress callback (#600 )
misc : typo (#688 )
2025-07-09 03:37:20 +02:00 · 2023-04-01 13:13:27 +03:00 · 2023-03-30 20:29:29 +03:00 · 2023-03-30 07:51:33 +03:00 · 2023-03-30 00:10:20 +03:00 · 2023-03-29 23:59:45 +03:00
50 changed files with 6850 additions and 1203 deletions
--- a/.gitignore
+++ b/.gitignore
@ -18,6 +18,7 @@ build-sanitize-thread/
 /stream
 /command
 /talk
+/talk-llama
 /bench

 arm_neon.h
@ -32,3 +33,5 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

 extra/bench-gg.txt
+
+*.mlmodel*
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -172,7 +172,9 @@ else()
            if(NOT WHISPER_NO_FMA)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
            endif()
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
+            if(NOT WHISPER_NO_F16C)
+                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
+            endif()
        endif()
    endif()
 endif()
--- a/Makefile
+++ b/Makefile
@ -30,10 +30,16 @@ endif
 # Compile flags
 #

-CFLAGS   = -I.              -O3 -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

+# ref: https://github.com/ggerganov/whisper.cpp/issues/37
+ifneq ($(wildcard /usr/include/musl/*),)
+	CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
+	CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
+endif
+
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@ -145,12 +151,15 @@ ifneq ($(filter aarch64%,$(UNAME_M)),)
 	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
-	# Raspberry Pi 1, 2, 3
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+	# 32-bit Raspberry Pi 1, 2, 3
+	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
 endif
 ifneq ($(filter armv7%,$(UNAME_M)),)
-	# Raspberry Pi 4
-	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+	# 32-bit ARM, for example on Armbian or possibly raspbian
+	CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+	
+	# 64-bit ARM, use these (TODO: auto-detect 64-bit)
+	# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
 endif
 ifneq ($(filter armv8%,$(UNAME_M)),)
 	# Raspberry Pi 4
@ -172,7 +181,7 @@ $(info I CC:       $(CCV))
 $(info I CXX:      $(CXXV))
 $(info )

-default: main
+default: main bench

 #
 # Build library
@ -191,7 +200,7 @@ libwhisper.so: ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so

 #
 # Examples
@ -206,6 +215,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h

+# Build the `bench` binary from examples/bench/bench.cpp and the ggml/whisper objects
+bench: examples/bench/bench.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+
 stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)

@ -215,8 +227,8 @@ command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whi
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)

-bench: examples/bench/bench.cpp ggml.o whisper.o
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
+# Build the `talk-llama` binary; $(CC_SDL) goes on the link line as in the stream/talk rules
+talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o $@ $(CC_SDL) $(LDFLAGS)

 #
 # Audio samples
--- a/README.md
+++ b/README.md
@ -313,7 +313,7 @@ whisper_print_timings:    total time = 32733.52 ms
 ## Real-time audio input example

 This is a naive example of performing real-time inference on audio from your microphone.
-The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continously.
+The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
 More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).

 ```java
@ -433,6 +433,19 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

 ---

+## Video comparison of different models
+
+Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:
+
+```java
+./extra/bench-wts.sh samples/jfk.wav
+ffplay ./samples/jfk.wav.all.mp4
+```
+
+https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
+
+---
+
 ## Benchmarks

 In order to have an objective comparison of the performance of the inference across different system configurations,
@ -453,7 +466,7 @@ The original models are converted to a custom binary format. This allows to pack
 You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
 or manually from here:

- https://huggingface.co/datasets/ggerganov/whisper.cpp
+- https://huggingface.co/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
@ -463,6 +476,7 @@ in [models](models).

 - [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
+  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
 - [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
@ -472,6 +486,7 @@ in [models](models).
 - [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
+- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)

 ## Examples

@ -485,6 +500,7 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
 | [stream](examples/stream) | [stream.wasm](examples/stream.wasm) | Real-time transcription of raw microphone capture |
 | [command](examples/command) | [command.wasm](examples/command.wasm) | Basic voice assistant example for receiving voice commands from the mic |
 | [talk](examples/talk) | [talk.wasm](examples/talk.wasm) | Talk with a GPT-2 bot |
+| [talk-llama](examples/talk-llama) | | Talk with a LLaMA bot |
 | [whisper.objc](examples/whisper.objc) | | iOS mobile application using whisper.cpp |
 | [whisper.swiftui](examples/whisper.swiftui) | | SwiftUI iOS / macOS application using whisper.cpp |
 | [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -17,9 +17,9 @@ import (
 // CONSTANTS

 const (
-	srcUrl  = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
-	srcExt  = ".bin"                                                               // Filename extension
-	bufSize = 1024 * 64                                                            // Size of the buffer used for downloading the model
+	srcUrl  = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
+	srcExt  = ".bin"                                                      // Filename extension
+	bufSize = 1024 * 64                                                   // Size of the buffer used for downloading the model
 )

 var (
--- a/bindings/go/pkg/whisper/model.go
+++ b/bindings/go/pkg/whisper/model.go
@ -94,6 +94,7 @@ func (model *model) NewContext() (Context, error) {
 	params.SetPrintRealtime(false)
 	params.SetPrintTimestamps(false)
 	params.SetThreads(runtime.NumCPU())
+	params.SetNoContext(true)

 	// Return new context
 	return newContext(model, params)
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -20,7 +20,7 @@ extern bool callEncoderBegin(void* user_data);
 // Text segment callback
 // Called on every newly generated text segment
 // Use the whisper_full_...() functions to obtain the text segments
-static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void* user_data) {
+static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_state* state, int n_new, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        callNewSegment(user_data, n_new);
    }
@ -29,7 +29,7 @@ static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void*
 // Encoder begin callback
 // If not NULL, called before the encoder starts
 // If it returns false, the computation is aborted
-static bool whisper_encoder_begin_cb(struct whisper_context* ctx, void* user_data) {
+static bool whisper_encoder_begin_cb(struct whisper_context* ctx, struct whisper_state* state, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        return callEncoderBegin(user_data);
    }
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@ -199,7 +199,7 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
  {
    static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-    rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
+    rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
      bool is_aborted = *(bool*)user_data;
      return !is_aborted;
    };
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@ -63,4 +63,5 @@ else()
    add_subdirectory(command)
    add_subdirectory(bench)
    add_subdirectory(talk)
+    add_subdirectory(talk-llama)
 endif()
--- a/examples/addon.node/test/whisper.spec.js
+++ b/examples/addon.node/test/whisper.spec.js
@ -1,15 +1,22 @@
-const path = require('path');
-const { whisper } = require(path.join(__dirname, '../../../build/Release/whisper-addon'));
+const path = require("path");
+const { whisper } = require(path.join(
+  __dirname,
+  "../../../build/Release/whisper-addon"
+));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);

 const whisperParamsMock = {
-    language: 'en',
-    model: path.join(__dirname, '../../../models/ggml-base.en.bin'),
-    fname_inp: path.join(__dirname, '../../../samples/jfk.wav'),
+  language: "en",
+  model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
+  fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
 };

 describe("Run whisper.node", () => {
+  test("it should receive a non-empty value", async () => {
+    let result = await whisperAsync(whisperParamsMock);

-    test("it should receive a non-empty value", () => {
-        expect(whisper(whisperParamsMock).length).toBeGreaterThan(0);
-    });
+    expect(result.length).toBeGreaterThan(0);
+  });
 });
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -72,7 +72,7 @@ int timestamp_to_sample(int64_t t, int n_samples) {
    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
 }

-void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@ -160,22 +160,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
        return 3;
    }

-    // initial prompt
-    std::vector<whisper_token> prompt_tokens;
-
-    if (!params.prompt.empty()) {
-        prompt_tokens.resize(1024);
-        prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
-
-        fprintf(stderr, "\n");
-        fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
-        fprintf(stderr, "initial tokens: [ ");
-        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
-            fprintf(stderr, "%d ", prompt_tokens[i]);
-        }
-        fprintf(stderr, "]\n");
-    }
-
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
        const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -243,8 +227,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;

-            wparams.prompt_tokens     = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
-            wparams.prompt_n_tokens   = prompt_tokens.empty() ? 0       : prompt_tokens.size();
+            wparams.initial_prompt   = params.prompt.c_str();

            whisper_print_user_data user_data = { &params, &pcmf32s };

@ -260,7 +243,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -292,51 +275,64 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
    return 0;
 }

-Napi::Object whisper(const Napi::CallbackInfo& info) {
-    Napi::Env env = info.Env();
-    if (info.Length() <= 0 || !info[0].IsObject()) {
-        Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
-    }
-    whisper_params params;
-    std::vector<std::vector<std::string>> result;
+class Worker : public Napi::AsyncWorker {
+ public:
+  Worker(Napi::Function& callback, whisper_params params)
+      : Napi::AsyncWorker(callback), params(params) {}

-    Napi::Object whisper_params = info[0].As<Napi::Object>();
-    std::string language = whisper_params.Get("language").As<Napi::String>();
-    std::string model = whisper_params.Get("model").As<Napi::String>();
-    std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
-
-    params.language = language;
-    params.model = model;
-    params.fname_inp.emplace_back(input);
-
-    // run model
+  void Execute() override {
    run(params, result);
+  }

-    fprintf(stderr, "RESULT:\n");
-    for (auto sentence:result) {
-        fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
-                sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
-    }
-
-    Napi::Object res = Napi::Array::New(env, result.size());
+  void OnOK() override {
+    Napi::HandleScope scope(Env());
+    Napi::Object res = Napi::Array::New(Env(), result.size());
    for (uint64_t i = 0; i < result.size(); ++i) {
-        Napi::Object tmp = Napi::Array::New(env, 3);
-        for (uint64_t j = 0; j < 3; ++j) {
-            tmp[j] = Napi::String::New(env, result[i][j]);
-        }
-        res[i] = tmp;
+      Napi::Object tmp = Napi::Array::New(Env(), 3);
+      for (uint64_t j = 0; j < 3; ++j) {
+        tmp[j] = Napi::String::New(Env(), result[i][j]);
+      }
+      res[i] = tmp;
    }
+    Callback().Call({Env().Null(), res});
+  }

-    return res;
+ private:
+  whisper_params params;
+  std::vector<std::vector<std::string>> result;
+};
+
+
+
+// JS entry point: whisper(params, callback).
+//
+// `params` must be an object with the string properties "language", "model"
+// and "fname_inp". `callback` is handed to a Worker, which calls it as
+// callback(null, result) from OnOK().
+// The work is only queued here, so this function itself returns undefined.
+// A TypeError is raised in JS when either argument has the wrong type.
+Napi::Value whisper(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  if (info.Length() <= 0 || !info[0].IsObject()) {
+    Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
+    // throwing does not unwind the C++ stack: return so that the invalid
+    // argument is not used below
+    return env.Undefined();
+  }
+  if (info.Length() <= 1 || !info[1].IsFunction()) {
+    Napi::TypeError::New(env, "callback function expected").ThrowAsJavaScriptException();
+    return env.Undefined();
+  }
+  whisper_params params;
+
+  Napi::Object whisper_params = info[0].As<Napi::Object>();
+  std::string language = whisper_params.Get("language").As<Napi::String>();
+  std::string model = whisper_params.Get("model").As<Napi::String>();
+  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
+
+  params.language = language;
+  params.model = model;
+  params.fname_inp.emplace_back(input);
+
+  Napi::Function callback = info[1].As<Napi::Function>();
+  Worker* worker = new Worker(callback, params);
+  worker->Queue();
+  return env.Undefined();
 }


 Napi::Object Init(Napi::Env env, Napi::Object exports) {
-    exports.Set(
-            Napi::String::New(env, "whisper"),
-            Napi::Function::New(env, whisper)
-    );
-    return exports;
+  exports.Set(
+      Napi::String::New(env, "whisper"),
+      Napi::Function::New(env, whisper)
+  );
+  return exports;
 }

 NODE_API_MODULE(whisper, Init);
--- a/examples/addon.node/index.js
+++ b/examples/addon.node/index.js
@ -1,27 +1,36 @@
-const path = require('path');
-const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));
+const path = require("path");
+const { whisper } = require(path.join(
+  __dirname,
+  "../../build/Release/whisper-addon"
+));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);

 const whisperParams = {
-    language: 'en',
-    model: path.join(__dirname, '../../models/ggml-base.en.bin'),
-    fname_inp: '',
+  language: "en",
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+  fname_inp: "../../samples/jfk.wav",
 };

 const arguments = process.argv.slice(2);
 const params = Object.fromEntries(
-    arguments.reduce((pre, item) => {
-        if (item.startsWith("--")) {
-            return [...pre, item.slice(2).split("=")];
-        }
-        return pre;
-    }, []),
+  arguments.reduce((pre, item) => {
+    if (item.startsWith("--")) {
+      return [...pre, item.slice(2).split("=")];
+    }
+    return pre;
+  }, [])
 );

 for (const key in params) {
-    if (whisperParams.hasOwnProperty(key)) {
-        whisperParams[key] = params[key];
-    }
+  if (whisperParams.hasOwnProperty(key)) {
+    whisperParams[key] = params[key];
+  }
 }

-console.log('whisperParams =', whisperParams);
-console.log(whisper(whisperParams));
+console.log("whisperParams =", whisperParams);
+
+whisperAsync(whisperParams).then((result) => {
+  console.log(`Result from whisper: ${result}`);
+});
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -31,6 +31,7 @@ options:
  -osrt,     --output-srt        [false  ] output result in a srt file
  -owts,     --output-words      [false  ] output script for generating karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
+  -oj,       --output-json       [false  ] output result in a JSON file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -8,6 +8,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <cstring>

 // Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
 // Lowest is red, middle is yellow, highest is green.
@ -73,6 +74,7 @@ struct whisper_params {
    bool output_srt     = false;
    bool output_wts     = false;
    bool output_csv     = false;
+    bool output_jsn     = false;
    bool print_special  = false;
    bool print_colors   = false;
    bool print_progress = false;
@ -80,6 +82,7 @@ struct whisper_params {

    std::string language = "en";
    std::string prompt;
+    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
    std::string model    = "models/ggml-base.en.bin";

    std::vector<std::string> fname_inp = {};
@ -127,7 +130,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
        else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
+        else if (arg == "-fp"   || arg == "--font-path")      { params.font_path      = argv[++i]; }
        else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
+        else if (arg == "-oj"   || arg == "--output-json")    { params.output_jsn     = true; }
        else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
        else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
@ -174,7 +179,9 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
+    fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
+    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
@ -193,7 +200,7 @@ struct whisper_print_user_data {
    const std::vector<std::vector<float>> * pcmf32s;
 };

-void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@ -352,28 +359,192 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

    const int n_segments = whisper_full_n_segments(ctx);
+    fout << "start,end,text\n";
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text    << "\"\n";
+        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text    << "\"\n";
    }

    return true;
 }

+// Escape a string so it can be placed inside a double-quoted JSON string.
+//
+// Every '"' and every '\' in `str` is prefixed with a backslash; all other
+// bytes are copied unchanged (control characters are NOT escaped).
+//
+// Returns a newly allocated, NUL-terminated buffer that the caller must
+// release with free(), or NULL when `str` is NULL or the allocation fails.
+char *escape_double_quotes(const char *str) {
+    if (str == NULL) {
+        return NULL;
+    }
+
+    // one byte per input character plus the terminator ...
+    size_t escaped_length = strlen(str) + 1;
+
+    // ... plus one extra byte for each character that gets a backslash
+    for (size_t i = 0; str[i] != '\0'; i++) {
+        if (str[i] == '"' || str[i] == '\\') {
+            escaped_length++;
+        }
+    }
+
+    char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
+    if (escaped == NULL) {
+        return NULL;
+    }
+
+    size_t pos = 0;
+    for (size_t i = 0; str[i] != '\0'; i++) {
+        if (str[i] == '"' || str[i] == '\\') {
+            escaped[pos++] = '\\';
+        }
+        escaped[pos++] = str[i];
+    }
+
+    // no need to set zero due to calloc() being used prior
+
+    return escaped;
+}
+
+// Write the transcription held in `ctx` to `fname` as a tab-indented JSON document.
+//
+// The document contains the system info string, the model hyper-parameters,
+// the model/language/translate settings from `params`, the detected language
+// and one "transcription" entry per segment (timestamps, offsets and text).
+//
+// Returns false when `fname` cannot be opened for writing, true otherwise.
+bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
+    std::ofstream fout(fname);
+    int indent = 0;
+
+    // emit one tab per current nesting level
+    auto doindent = [&]() {
+        for (int i = 0; i < indent; i++) fout << "\t";
+    };
+
+    // open a named array: "name": [
+    auto start_arr = [&](const char *name) {
+        doindent();
+        fout << "\"" << name << "\": [\n";
+        indent++;
+    };
+
+    // close an array; `end` = last member of its parent, so no trailing comma
+    auto end_arr = [&](bool end = false) {
+        indent--;
+        doindent();
+        fout << (end ? "]\n" : "],\n");
+    };
+
+    // open an object, either anonymous or as "name": {
+    auto start_obj = [&](const char *name = nullptr) {
+        doindent();
+        if (name) {
+            fout << "\"" << name << "\": {\n";
+        } else {
+            fout << "{\n";
+        }
+        indent++;
+    };
+
+    // close an object; `end` = last member of its parent, so no trailing comma
+    auto end_obj = [&](bool end = false) {
+        indent--;
+        doindent();
+        fout << (end ? "}\n" : "},\n");
+    };
+
+    // write the indented key prefix: "name":
+    auto start_value = [&](const char *name) {
+        doindent();
+        fout << "\"" << name << "\": ";
+    };
+
+    // write a string member, escaped via escape_double_quotes()
+    auto value_s = [&](const char *name, const char *val, bool end = false) {
+        start_value(name);
+        char * val_escaped = escape_double_quotes(val);
+        // escape_double_quotes() returns NULL for a NULL input or a failed
+        // allocation; streaming a null char pointer is invalid, so emit ""
+        fout << "\"" << (val_escaped ? val_escaped : "") << (end ? "\"\n" : "\",\n");
+        free(val_escaped);
+    };
+
+    // terminate a non-string member, with a comma unless it is the last one
+    auto end_value = [&](bool end = false) {
+        fout << (end ? "\n" : ",\n");
+    };
+
+    // write an integer member
+    auto value_i = [&](const char *name, const int64_t val, bool end = false) {
+        start_value(name);
+        fout << val;
+        end_value(end);
+    };
+
+    // write a boolean member as true/false
+    auto value_b = [&](const char *name, const bool val, bool end = false) {
+        start_value(name);
+        fout << (val ? "true" : "false");
+        end_value(end);
+    };
+
+    if (!fout.is_open()) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
+        return false;
+    }
+
+    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
+    start_obj();
+        value_s("systeminfo", whisper_print_system_info());
+        start_obj("model");
+            value_s("type", whisper_model_type_readable(ctx));
+            value_b("multilingual", whisper_is_multilingual(ctx));
+            value_i("vocab", whisper_model_n_vocab(ctx));
+            start_obj("audio");
+                value_i("ctx", whisper_model_n_audio_ctx(ctx));
+                value_i("state", whisper_model_n_audio_state(ctx));
+                value_i("head", whisper_model_n_audio_head(ctx));
+                value_i("layer", whisper_model_n_audio_layer(ctx), true);
+            end_obj();
+            start_obj("text");
+                value_i("ctx", whisper_model_n_text_ctx(ctx));
+                value_i("state", whisper_model_n_text_state(ctx));
+                value_i("head", whisper_model_n_text_head(ctx));
+                value_i("layer", whisper_model_n_text_layer(ctx), true);
+            end_obj();
+            value_i("mels", whisper_model_n_mels(ctx));
+            value_i("f16", whisper_model_f16(ctx), true);
+        end_obj();
+        start_obj("params");
+            value_s("model", params.model.c_str());
+            value_s("language", params.language.c_str());
+            value_b("translate", params.translate, true);
+        end_obj();
+        start_obj("result");
+            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
+        end_obj();
+        start_arr("transcription");
+
+            const int n_segments = whisper_full_n_segments(ctx);
+            for (int i = 0; i < n_segments; ++i) {
+                const char * text = whisper_full_get_segment_text(ctx, i);
+                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+                start_obj();
+                    start_obj("timestamps");
+                        value_s("from", to_timestamp(t0, true).c_str());
+                        value_s("to", to_timestamp(t1, true).c_str(), true);
+                    end_obj();
+                    // offsets are the raw segment times multiplied by 10
+                    start_obj("offsets");
+                        value_i("from", t0 * 10);
+                        value_i("to", t1 * 10, true);
+                    end_obj();
+                    value_s("text", text, true);
+                // the last array element must not be followed by a comma
+                end_obj(i == (n_segments - 1));
+            }
+
+        end_arr(true);
+    end_obj(true);
+    return true;
+}
+
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
    std::ofstream fout(fname);

    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

-    // TODO: become parameter
-    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
+    static const char * font = params.font_path.c_str();
+
+    std::ifstream fin(font);
+    if (!fin.is_open()) {
+        fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
+        return false;
+    }

    fout << "#!/bin/bash" << "\n";
    fout << "\n";
@ -504,22 +675,6 @@ int main(int argc, char ** argv) {
        return 3;
    }

-    // initial prompt
-    std::vector<whisper_token> prompt_tokens;
-
-    if (!params.prompt.empty()) {
-        prompt_tokens.resize(1024);
-        prompt_tokens.resize(whisper_tokenize(ctx, params.prompt.c_str(), prompt_tokens.data(), prompt_tokens.size()));
-
-        fprintf(stderr, "\n");
-        fprintf(stderr, "initial prompt: '%s'\n", params.prompt.c_str());
-        fprintf(stderr, "initial tokens: [ ");
-        for (int i = 0; i < (int) prompt_tokens.size(); ++i) {
-            fprintf(stderr, "%d ", prompt_tokens[i]);
-        }
-        fprintf(stderr, "]\n");
-    }
-
    for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
        const auto fname_inp = params.fname_inp[f];
 		const auto fname_out = f < (int) params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
@ -583,8 +738,7 @@ int main(int argc, char ** argv) {

            wparams.speed_up         = params.speed_up;

-            wparams.prompt_tokens     = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
-            wparams.prompt_n_tokens   = prompt_tokens.empty() ? 0       : prompt_tokens.size();
+            wparams.initial_prompt   = params.prompt.c_str();

            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;
@ -607,7 +761,7 @@ int main(int argc, char ** argv) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race

-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -653,6 +807,12 @@ int main(int argc, char ** argv) {
                const auto fname_csv = fname_out + ".csv";
                output_csv(ctx, fname_csv.c_str());
            }
+
+            // output to JSON file
+            if (params.output_jsn) {
+                const auto fname_jsn = fname_out + ".json";
+                output_json(ctx, fname_jsn.c_str(), params);
+            }
        }
    }

--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -288,7 +288,6 @@ int main(int argc, char ** argv) {
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
-            wparams.no_context       = true;
            wparams.single_segment   = !use_vad;
            wparams.max_tokens       = params.max_tokens;
            wparams.language         = params.language.c_str();
--- a/examples/talk-llama/.gitignore
+++ b/examples/talk-llama/.gitignore
@ -0,0 +1,2 @@
+eleven-labs.py
+audio.mp3
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@ -0,0 +1,16 @@
+if (WHISPER_SUPPORT_SDL2)
+    # talk-llama: only built when SDL2 support is enabled (WHISPER_SUPPORT_SDL2)
+    set(TARGET talk-llama)
+    #add_executable(${TARGET} talk-llama.cpp llama.cpp)
+    #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+    #target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+
+    # TODO: temporary workaround - the common, common-sdl, ggml and whisper sources are compiled straight into the
+    #       executable (instead of linking the libraries as in the commented-out lines above) because the ggml symbols are not exported for MSVC yet
+    add_executable(${TARGET} talk-llama.cpp llama.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
+
+    target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
+    target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
+
+    include(DefaultTargetOptions)
+endif ()
--- a/examples/talk-llama/README.md
+++ b/examples/talk-llama/README.md
@ -0,0 +1,36 @@
+# talk-llama
+
+Talk with a LLaMA AI in your terminal
+
+[Demo Talk](https://user-images.githubusercontent.com/1991296/228024237-848f998c-c334-46a6-bef8-3271590da83b.mp4)
+
+## Building
+
+The `talk-llama` tool depends on the SDL2 library to capture audio from the microphone. You can build it like this:
+
+```bash
+# Install SDL2 on Linux
+sudo apt-get install libsdl2-dev
+
+# Install SDL2 on Mac OS
+brew install sdl2
+
+# Build the "talk-llama" executable
+make talk-llama
+
+# Run it
+./talk-llama -mw ./models/ggml-small.en.bin -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin -p "Georgi" -t 8
+```
+
+- The `-mw` argument specifies the Whisper model that you would like to use. `base` or `small` is recommended for a real-time experience
+- The `-ml` argument specifies the LLaMA model that you would like to use. Read the instructions in https://github.com/ggerganov/llama.cpp for information about how to obtain a `ggml` compatible LLaMA model
+
+## TTS
+
+For the best experience, this example needs a TTS tool to convert the generated text responses to voice.
+You can use any TTS engine that you would like - simply edit the [speak.sh](speak.sh) script to your needs.
+By default, it is configured to use macOS's `say`, but you can use whatever you wish.
+
+## Discussion
+
+If you have any feedback, please let "us" know in the following discussion: https://github.com/ggerganov/whisper.cpp/discussions/672?converting=1
--- a/examples/talk-llama/llama.cpp
+++ b/examples/talk-llama/llama.cpp
--- a/examples/talk-llama/llama.h
+++ b/examples/talk-llama/llama.h
@ -0,0 +1,152 @@
+#ifndef LLAMA_H
+#define LLAMA_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#ifdef LLAMA_SHARED
+#    if defined(_WIN32) && !defined(__MINGW32__)
+#        ifdef LLAMA_BUILD
+#            define LLAMA_API __declspec(dllexport)
+#        else
+#            define LLAMA_API __declspec(dllimport)
+#        endif
+#    else
+#        define LLAMA_API __attribute__ ((visibility ("default")))
+#    endif
+#else
+#    define LLAMA_API
+#endif
+
+#define LLAMA_FILE_VERSION 1
+#define LLAMA_FILE_MAGIC 0x67676a74 // 'ggjt' in hex
+#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    //
+    // C interface
+    //
+    // TODO: show sample usage
+    //
+
+    struct llama_context;
+
+    typedef int llama_token;
+
+    typedef struct llama_token_data {
+        llama_token id;  // token id
+
+        float p;     // probability of the token
+        float plog;  // log probability of the token
+
+    } llama_token_data;
+
+    typedef void (*llama_progress_callback)(float progress, void *ctx);
+
+    struct llama_context_params {
+        int n_ctx;   // text context
+        int n_parts; // -1 for default
+        int seed;    // RNG seed, 0 for random
+
+        bool f16_kv;     // use fp16 for KV cache
+        bool logits_all; // the llama_eval() call computes all logits, not just the last one
+        bool vocab_only; // only load the vocabulary, no weights
+        bool use_mlock;  // force system to keep model in RAM
+        bool embedding;  // embedding mode only
+
+        // called with a progress value between 0 and 1, pass NULL to disable
+        llama_progress_callback progress_callback;
+        // context pointer passed to the progress callback
+        void * progress_callback_user_data;
+    };
+
+    // Returns a llama_context_params value that callers can adjust before passing it to llama_init_from_file().
+    // Declared with (void): in C, an empty parameter list would declare an unprototyped function.
+    LLAMA_API struct llama_context_params llama_context_default_params(void);
+
+    // Various functions for loading a ggml llama model.
+    // Allocate (almost) all memory needed for the model.
+    // Return NULL on failure
+    LLAMA_API struct llama_context * llama_init_from_file(
+                             const char * path_model,
+            struct llama_context_params   params);
+
+    // Frees all allocated memory
+    LLAMA_API void llama_free(struct llama_context * ctx);
+
+    // TODO: not great API - very likely to change
+    // Returns 0 on success
+    LLAMA_API int llama_model_quantize(
+            const char * fname_inp,
+            const char * fname_out,
+                   int   itype);
+
+    // Run the llama inference to obtain the logits and probabilities for the next token.
+    // tokens + n_tokens is the provided batch of new tokens to process
+    // n_past is the number of tokens to use from previous eval calls
+    // Returns 0 on success
+    LLAMA_API int llama_eval(
+            struct llama_context * ctx,
+               const llama_token * tokens,
+                             int   n_tokens,
+                             int   n_past,
+                             int   n_threads);
+
+    // Convert the provided text into tokens, written to the caller-supplied tokens buffer.
+    // The tokens buffer must have room for at least n_max_tokens entries.
+    // Returns the number of tokens on success, no more than n_max_tokens
+    // Returns a negative number on failure - its magnitude is the number of tokens that would have been returned
+    // TODO: not sure if correct (NOTE(review): the failure contract above is unverified - confirm against llama.cpp)
+    LLAMA_API int llama_tokenize(
+            struct llama_context * ctx,
+                      const char * text,
+                     llama_token * tokens,
+                             int   n_max_tokens,
+                            bool   add_bos);
+
+    LLAMA_API int llama_n_vocab(struct llama_context * ctx);
+    LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
+    LLAMA_API int llama_n_embd (struct llama_context * ctx);
+
+    // Token logits obtained from the last call to llama_eval()
+    // The logits for the last token are stored in the last row
+    // Can be mutated in order to change the probabilities of the next token
+    // Rows: n_tokens
+    // Cols: n_vocab
+    LLAMA_API float * llama_get_logits(struct llama_context * ctx);
+
+    // Get the embeddings for the input
+    // shape: [n_embd] (1-dimensional)
+    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
+
+    // Token Id -> String. Uses the vocabulary in the provided context
+    LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
+
+    // Special tokens
+    // Declared with (void) so that C callers get a real prototype, like llama_print_system_info(void).
+    LLAMA_API llama_token llama_token_bos(void);
+    LLAMA_API llama_token llama_token_eos(void);
+
+    // TODO: improve the last_n_tokens interface ?
+    LLAMA_API llama_token llama_sample_top_p_top_k(
+       struct llama_context * ctx,
+          const llama_token * last_n_tokens_data,
+                        int   last_n_tokens_size,
+                        int   top_k,
+                      float   top_p,
+                      float   temp,
+                      float   repeat_penalty);
+
+    // Performance information
+    LLAMA_API void llama_print_timings(struct llama_context * ctx);
+    LLAMA_API void llama_reset_timings(struct llama_context * ctx);
+
+    // Print system information
+    LLAMA_API const char * llama_print_system_info(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/examples/talk-llama/prompts/talk-alpaca.txt
+++ b/examples/talk-llama/prompts/talk-alpaca.txt
@ -0,0 +1,23 @@
+Below is an instruction that describes a task. Write a response that appropriately completes the request.
+
+### Instruction:
+
+Write a text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
+{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
+There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+{1} responds with short and concise answers.
+
+### Response:
+
+{0}{4} Hello, {1}!
+{1}{4} Hello {0}! How may I help you today?
+{0}{4} What time is it?
+{1}{4} It is {2} o'clock.
+{0}{4} What year is it?
+{1}{4} We are in {3}.
+{0}{4} What is a cat?
+{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
+{0}{4} Name a color.
+{1}{4} Blue
+{0}{4}
--- a/examples/talk-llama/speak.sh
+++ b/examples/talk-llama/speak.sh
@ -0,0 +1,28 @@
+#!/bin/bash
+
+# Text-to-speech helper: speaks the given text with the voice selected by <voice_id>.
+#
+# Usage:
+#  speak.sh <voice_id> <text-to-speak>
+
+# espeak
+# Mac OS: brew install espeak
+# Linux: apt-get install espeak
+#
+#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
+
+# for Mac
+case "$1" in
+    0) say "$2" ;;
+    1) say -v "Samantha (Enhanced)" "$2" ;;
+    2) say -v "Daniel (Enhanced)" "$2" ;;
+    3) say -v "Veena (Enhanced)" "$2" ;;
+    *)
+        # unknown voice id: warn and use the default voice instead of silently saying nothing
+        echo "speak.sh: unknown voice id '$1', using the default voice" >&2
+        say "$2"
+        ;;
+esac
+
+# Eleven Labs
+#
+#wd=$(dirname $0)
+#script=$wd/eleven-labs.py
+#python3 $script $1 "$2" >/dev/null 2>&1
+#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -0,0 +1,703 @@
+// Talk with AI
+//
+
+#include "common.h"
+#include "common-sdl.h"
+#include "whisper.h"
+#include "llama.h"
+
+#include <algorithm>
+#include <cassert>
+#include <chrono>
+#include <cstdio>
+#include <cstdlib>
+#include <ctime>
+#include <fstream>
+#include <iterator>
+#include <map>
+#include <regex>
+#include <string>
+#include <thread>
+#include <vector>
+
+// Convenience wrapper around the C llama_tokenize(): tokenizes `text` and returns the tokens as a vector.
+// Returns an empty vector if tokenization still fails after retrying with a larger buffer.
+std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
+    // initialize to prompt number of chars, since n_tokens <= n_prompt_chars
+    std::vector<llama_token> res(text.size() + (int)add_bos);
+    int n = llama_tokenize(ctx, text.c_str(), res.data(), (int) res.size(), add_bos);
+    if (n < 0) {
+        // llama.h: a negative result is the negated number of tokens that would have been
+        // returned, i.e. the buffer was too small - grow it and retry once
+        res.resize(-n);
+        n = llama_tokenize(ctx, text.c_str(), res.data(), (int) res.size(), add_bos);
+    }
+    // do not rely on assert() alone: it is compiled out when NDEBUG is defined, and a negative
+    // count passed to resize() would be converted to a huge unsigned size
+    assert(n >= 0);
+    res.resize(n > 0 ? n : 0);
+
+    return res;
+}
+
+// command-line parameters
+// (the initializers are the defaults printed by whisper_print_usage(); whisper_params_parse() overrides them)
+struct whisper_params {
+    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency()); // number of threads to use during computation
+    int32_t voice_id   = 0;     // voice ID
+    int32_t voice_ms   = 10000; // voice duration in milliseconds
+    int32_t capture_id = -1;    // capture device ID
+    int32_t max_tokens = 64;    // maximum number of tokens per audio chunk
+    int32_t audio_ctx  = 0;     // audio context size (0 - all)
+
+    int32_t n_parts_llama = -1; // num parts in llama model file (-1 for default)
+
+    float vad_thold    = 0.4f;   // voice activity detection threshold
+    float freq_thold   = 100.0f; // high-pass frequency cutoff
+
+    bool speed_up      = false; // speed up audio by x2 (reduced accuracy)
+    bool translate     = false; // translate from source language to english
+    bool print_special = false; // print special tokens
+    bool print_energy  = false; // print sound energy (for debugging)
+    bool no_timestamps = true;  // no command-line option sets this; timestamps stay disabled
+    bool verbose_prompt = false; // print prompt at start
+
+    std::string name_ni     = "Georgi"; // natural    intelligence
+    std::string name_ai     = "LLaMA";  // artificial intelligence
+    std::string language    = "en";
+    std::string model_wsp   = "models/ggml-base.en.bin";
+    std::string model_llama = "models/ggml-llama-7B.bin";
+    // TTS command; defaults to the speak.sh that ships with this example (see README.md)
+    std::string speak       = "./examples/talk-llama/speak.sh";
+    std::string prompt      = ""; // custom LLaMA prompt loaded via --prompt-file (empty = built-in prompt)
+    std::string fname_out;        // text output file name
+};
+
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+
+// Parse the command-line arguments into `params`.
+//
+// -h/--help prints the usage text and exits with status 0.
+// Returns false, after reporting the problem on stderr, when:
+//   - an argument is unknown
+//   - an option that needs a value is the last argument
+//   - the file given to --prompt-file cannot be opened
+// Returns true otherwise.
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+    // options that consume the following argv entry as their value
+    static const char * const k_value_opts[] = {
+        "-t",   "--threads",
+        "-vid", "--voice-id",
+        "-vms", "--voice-ms",
+        "-c",   "--capture",
+        "-mt",  "--max-tokens",
+        "-ac",  "--audio-ctx",
+        "-vth", "--vad-thold",
+        "-fth", "--freq-thold",
+        "--n-parts-llama",
+        "-nni", "--name-ni",
+        "-nai", "--name-ai",
+        "-l",   "--language",
+        "-mw",  "--model-whisper",
+        "-ml",  "--model-llama",
+        "-s",   "--speak",
+        "--prompt-file",
+        "-f",   "--file",
+    };
+
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+
+        // a value-taking option in the last position would make argv[++i] below read past argc
+        if (i + 1 >= argc) {
+            for (const char * opt : k_value_opts) {
+                if (arg == opt) {
+                    fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+                    whisper_print_usage(argc, argv, params);
+                    return false;
+                }
+            }
+        }
+
+        if      (arg == "-t"   || arg == "--threads")       { params.n_threads      = std::stoi(argv[++i]); }
+        else if (arg == "-vid" || arg == "--voice-id")      { params.voice_id       = std::stoi(argv[++i]); }
+        else if (arg == "-vms" || arg == "--voice-ms")      { params.voice_ms       = std::stoi(argv[++i]); }
+        else if (arg == "-c"   || arg == "--capture")       { params.capture_id     = std::stoi(argv[++i]); }
+        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens     = std::stoi(argv[++i]); }
+        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx      = std::stoi(argv[++i]); }
+        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold      = std::stof(argv[++i]); }
+        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold     = std::stof(argv[++i]); }
+        else if (arg == "--n-parts-llama")                  { params.n_parts_llama  = std::stoi(argv[++i]); }
+        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up       = true; }
+        else if (arg == "-tr"  || arg == "--translate")     { params.translate      = true; }
+        else if (arg == "-ps"  || arg == "--print-special") { params.print_special  = true; }
+        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy   = true; }
+        else if (arg == "--verbose-prompt")                 { params.verbose_prompt = true; }
+        else if (arg == "-nni" || arg == "--name-ni")       { params.name_ni        = argv[++i]; }
+        else if (arg == "-nai" || arg == "--name-ai")       { params.name_ai        = argv[++i]; }
+        else if (arg == "-l"   || arg == "--language")      { params.language       = argv[++i]; }
+        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp      = argv[++i]; }
+        else if (arg == "-ml"  || arg == "--model-llama")   { params.model_llama    = argv[++i]; }
+        else if (arg == "-s"   || arg == "--speak")         { params.speak          = argv[++i]; }
+        else if (arg == "--prompt-file")                    {
+            const char * fname = argv[++i];
+            std::ifstream file(fname);
+            if (!file) {
+                fprintf(stderr, "error: failed to open prompt file '%s'\n", fname);
+                return false;
+            }
+            params.prompt.append(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>());
+            // strip one trailing newline; the empty() check avoids calling back() on an empty string
+            if (!params.prompt.empty() && params.prompt.back() == '\n') {
+                params.prompt.pop_back();
+            }
+        }
+        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
+        else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            // report the failure to the caller (main() returns 1) instead of exiting with status 0
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Print the command-line help to stderr.
+// The bracketed column shows the values currently stored in `params`; argv[0] is used as the program name.
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
+    fprintf(stderr, "  -vid N,   --voice-id N    [%-7d] voice ID\n",                                    params.voice_id);
+    fprintf(stderr, "  -vms N,   --voice-ms N    [%-7d] voice duration in milliseconds\n",              params.voice_ms);
+    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
+    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
+    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
+    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
+    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
+    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
+    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
+    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
+    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
+    fprintf(stderr, "  -nni NAME,--name-ni NAME  [%-7s] natural intelligence name\n",                   params.name_ni.c_str());
+    fprintf(stderr, "  -nai NAME,--name-ai NAME  [%-7s] artificial intelligence name\n",                params.name_ai.c_str());
+    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
+    fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n",                          params.model_wsp.c_str());
+    fprintf(stderr, "  -ml FILE, --model-llama   [%-7s] llama model file\n",                            params.model_llama.c_str());
+    fprintf(stderr, "  --n-parts-llama N         [%-7d] num parts in llama model file\n",               params.n_parts_llama);
+    // the short and long form take the same value, so both are shown with the FILE metavar
+    fprintf(stderr, "  -s FILE,  --speak FILE    [%-7s] command for TTS\n",                             params.speak.c_str());
+    fprintf(stderr, "  --prompt-file FNAME       [%-7s] file with custom prompt to start dialog\n",     "");
+    fprintf(stderr, "  --verbose-prompt          [%-7s] print prompt at start\n",                       params.verbose_prompt ? "true" : "false");
+    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n",                       params.fname_out.c_str());
+    fprintf(stderr, "\n");
+}
+
+// Transcribe a chunk of audio with whisper and return the concatenated text of all segments.
+//
+//   ctx         - whisper context used for tokenizing the prompt and running inference
+//   params      - command-line parameters; the decoding options are copied from here
+//   pcmf32      - audio samples to transcribe
+//   prompt_text - text that is tokenized and handed to whisper as prompt tokens
+//   prob        - [out] mean probability over all decoded tokens (0 when there are none)
+//   t_ms        - [out] wall-clock time spent in this call in milliseconds (left at 0 on failure)
+//
+// Returns an empty string if whisper_full() fails.
+std::string transcribe(
+        whisper_context * ctx,
+        const whisper_params & params,
+        const std::vector<float> & pcmf32,
+        const std::string prompt_text,
+        float & prob,
+        int64_t & t_ms) {
+    const auto t_start = std::chrono::high_resolution_clock::now();
+
+    prob = 0.0f;
+    t_ms = 0;
+
+    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+    // tokenize the prompt into a fixed-size buffer; a negative count (tokenization failure) must not
+    // reach resize(), where it would be converted to a huge unsigned size - continue without a prompt
+    std::vector<whisper_token> prompt_tokens(1024);
+    const int n_prompt_tokens = whisper_tokenize(ctx, prompt_text.c_str(), prompt_tokens.data(), prompt_tokens.size());
+    if (n_prompt_tokens < 0) {
+        fprintf(stderr, "%s: failed to tokenize the prompt, continuing without it\n", __func__);
+    }
+    prompt_tokens.resize(n_prompt_tokens > 0 ? n_prompt_tokens : 0);
+
+    wparams.print_progress   = false;
+    wparams.print_special    = params.print_special;
+    wparams.print_realtime   = false;
+    wparams.print_timestamps = !params.no_timestamps;
+    wparams.translate        = params.translate;
+    wparams.no_context       = true;
+    wparams.single_segment   = true;
+    wparams.max_tokens       = params.max_tokens;
+    wparams.language         = params.language.c_str();
+    wparams.n_threads        = 2; // fixed; params.n_threads is not used for whisper here
+
+    wparams.prompt_tokens    = prompt_tokens.empty() ? nullptr : prompt_tokens.data();
+    wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();
+
+    wparams.audio_ctx        = params.audio_ctx;
+    wparams.speed_up         = params.speed_up;
+
+    // sleep 100 ms * iter before decoding; iter starts at voice_id and then cycles through 0..3
+    // NOTE(review): presumably staggers several instances running side by side - confirm intent
+    static int iter = params.voice_id;
+    std::this_thread::sleep_for(std::chrono::milliseconds(100*iter));
+    iter = (iter + 1) % 4;
+
+    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+        return "";
+    }
+
+    int prob_n = 0;
+    std::string result;
+
+    // concatenate the text of every segment and accumulate the token probabilities
+    const int n_segments = whisper_full_n_segments(ctx);
+    for (int i = 0; i < n_segments; ++i) {
+        const char * text = whisper_full_get_segment_text(ctx, i);
+
+        result += text;
+
+        const int n_tokens = whisper_full_n_tokens(ctx, i);
+        for (int j = 0; j < n_tokens; ++j) {
+            const auto token = whisper_full_get_token_data(ctx, i, j);
+
+            prob += token.p;
+            ++prob_n;
+        }
+    }
+
+    // turn the sum into a mean; guarded so that zero tokens leaves prob at 0
+    if (prob_n > 0) {
+        prob /= prob_n;
+    }
+
+    const auto t_end = std::chrono::high_resolution_clock::now();
+    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
+
+    return result;
+}
+
+const std::vector<std::string> k_participants = { // names of the AI participants; substituted for {10}..{13} in the prompts
+    "LLaMA",
+    "GGaMA",
+    "SSaMA",
+    "RRaMA",
+};
+
+// homophones: alternative spellings listed per participant name (presumably the forms the name may be transcribed as - TODO confirm against the usage)
+const std::map<std::string, std::vector<std::string>> k_homophones = {
+    { "LLaMA", { "llama", "Llama", "LLAMA", }, },
+    { "GGaMA", { "gama", "Gama", "GAMA", "gamma", "Gamma", "GAMMA", }, },
+    { "SSaMA", { "sama", "Sama", "SAMA", "samma", "Samma", "SAMMA", }, },
+    { "RRaMA", { "rama", "Rama", "RAMA", "ramma", "Ramma", "RAMMA", }, },
+};
+
+const std::string k_prompt_whisper = R"(A conversation between {1}, {10}, {11}, {12} and {13}.)"; // prompt text passed to transcribe(); main() replaces {1} with name_ni and {10}..{13} with k_participants[0..3]
+
+const std::map<std::string, std::string> k_prompt = { // initial LLaMA prompt per AI participant, keyed by participant name; main() looks it up by name_ai and substitutes {1}, {4} and {10}..{13}
+    {
+        k_participants.at(0),
+        R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
+There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+{10}, {11}, {12} and {13} respond with short and concise answers.
+{10} is smart, objective, honest and kind. Never fails to give a meaningful and insightful answer and opinion.
+{1} is leading the conversation and asking the questions.
+
+{1}{4} Hello {10}! What is your opinion on the current state of the world?
+{10}{4} Great question {1}! I think we live in a very interesting time.
+There are many things to be concerned about, but also many things to be optimistic about.
+{1}{4} What advice would you give to a young person who is just starting out in life?
+{10}{4} I would tell them to be patient and to not be afraid to fail.
+It is important to learn from your mistakes and to keep trying.
+{1}{4})"
+    },
+    {
+        k_participants.at(1),
+        R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
+There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+{10}, {11}, {12} and {13} respond with short and concise answers.
+{11} has critical thinking skills, is very knowledgeable and is a good listener. He is very humble and never arrogant.
+{1} is leading the conversation and asking the questions.
+
+{1}{4} Hello {11}! What is your opinion on the current state of the world?
+{11}{4} The world is about to experience a major change. We are on the verge of a new era.
+{1}{4} What advice would you give to a young person who is just starting out in life?
+{11}{4} My advice would be to be open minded and to be willing to learn from others.
+{1}{4})"
+    },
+    {
+        k_participants.at(2),
+        R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
+There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+{10}, {11}, {12} and {13} respond with short and concise answers.
+{12} has strong leadership skills, strategic thinking, and innovative ideas. Has the ability to mentor and support young people.
+{1} is leading the conversation and asking the questions.
+
+{1}{4} Hello {12}! What is your opinion on the current state of the world?
+{12}{4} Our future is bright. We are living in a time of great opportunity.
+{1}{4} What advice would you give to a young person who is just starting out in life?
+{12}{4} I would tell them to be brave and to be willing to take risks.
+{1}{4})"
+    },
+    {
+        k_participants.at(3),
+        R"(Text transcript of a never ending dialog, between {1}, {10}, {11}, {12} and {13}.
+There are no annotations like (30 seconds passed...) or (to himself), just what the participants say aloud to each other.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+{10}, {11}, {12} and {13} respond with short and concise answers.
+{13} is rude, arrogant, and has a bad attitude. He is very opinionated and never listens to others.
+{1} is leading the conversation and asking the questions.
+
+{1}{4} Hello {13}! What is your opinion on the current state of the world?
+{13}{4} The world is a terrible place. It is full of evil and corruption.
+{1}{4} What advice would you give to a young person who is just starting out in life?
+{13}{4} I would tell them to be selfish and to never trust anyone.
+{1}{4})"
+    },
+};
+
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    if (whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        whisper_print_usage(argc, argv, params);
+        exit(0);
+    }
+
+    // whisper init
+
+    struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
+
+    // llama init
+
+    auto lparams = llama_context_default_params();
+
+    // tune these to your liking
+    lparams.n_ctx      = 512;
+    lparams.seed       = 1;
+    lparams.f16_kv     = true;
+    lparams.n_parts    = params.n_parts_llama;
+
+    struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
+
+    // print some info about the processing
+    {
+        fprintf(stderr, "\n");
+
+        if (!whisper_is_multilingual(ctx_wsp)) {
+            if (params.language != "en" || params.translate) {
+                params.language = "en";
+                params.translate = false;
+                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+            }
+        }
+        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
+                __func__,
+                params.n_threads,
+                params.language.c_str(),
+                params.translate ? "translate" : "transcribe",
+                params.no_timestamps ? 0 : 1);
+
+        fprintf(stderr, "\n");
+    }
+
+
+    // init audio
+
+    audio_async audio(30*1000);
+    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
+        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
+        return 1;
+    }
+
+    audio.resume();
+
+    int n_iter = 0;
+
+    bool is_running  = true;
+    bool force_speak = false;
+
+    float prob0 = 0.0f;
+
+    const std::string chat_symb = ":";
+
+    const std::string name_ni  = params.name_ni;
+    const std::string name_ai  = params.name_ai;
+
+    // the participant that was referenced last
+    std::string name_ref = name_ni;
+
+    std::vector<float> pcmf32_cur;
+    std::vector<float> pcmf32_prompt;
+
+    std::string prompt_whisper = k_prompt_whisper;
+
+    prompt_whisper = ::replace(prompt_whisper, "{1}",  name_ni);
+    prompt_whisper = ::replace(prompt_whisper, "{10}", k_participants.at(0));
+    prompt_whisper = ::replace(prompt_whisper, "{11}", k_participants.at(1));
+    prompt_whisper = ::replace(prompt_whisper, "{12}", k_participants.at(2));
+    prompt_whisper = ::replace(prompt_whisper, "{13}", k_participants.at(3));
+
+    // construct the initial prompt for LLaMA inference
+    std::string prompt_llama = params.prompt.empty() ? k_prompt.find(name_ai)->second : params.prompt;
+
+    // need to have leading ' '
+    prompt_llama.insert(0, 1, ' ');
+
+    prompt_llama = ::replace(prompt_llama, "{1}",  name_ni);
+    prompt_llama = ::replace(prompt_llama, "{10}", k_participants.at(0));
+    prompt_llama = ::replace(prompt_llama, "{11}", k_participants.at(1));
+    prompt_llama = ::replace(prompt_llama, "{12}", k_participants.at(2));
+    prompt_llama = ::replace(prompt_llama, "{13}", k_participants.at(3));
+
+    {
+        // get date string
+        std::string date_str;
+        {
+            time_t t = time(0);
+            struct tm * now = localtime(&t);
+            char buf[128];
+            strftime(buf, sizeof(buf), "%d/%m/%Y", now);
+            date_str = buf;
+        }
+        prompt_llama = ::replace(prompt_llama, "{1}", date_str);
+    }
+
+    {
+        // get time string
+        std::string time_str;
+        {
+            time_t t = time(0);
+            struct tm * now = localtime(&t);
+            char buf[128];
+            strftime(buf, sizeof(buf), "%H:%M", now);
+            time_str = buf;
+        }
+        prompt_llama = ::replace(prompt_llama, "{2}", time_str);
+    }
+
+    {
+        // get year string
+        std::string year_str;
+        {
+            time_t t = time(0);
+            struct tm * now = localtime(&t);
+            char buf[128];
+            strftime(buf, sizeof(buf), "%Y", now);
+            year_str = buf;
+        }
+        prompt_llama = ::replace(prompt_llama, "{3}", year_str);
+    }
+
+    prompt_llama = ::replace(prompt_llama, "{4}", chat_symb);
+
+    // evaluate the initial prompt
+
+    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_llama, true);
+
+    printf("\n");
+    printf("%s : initializing - please wait ...\n", __func__);
+
+    if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
+        fprintf(stderr, "%s : failed to eval\n", __func__);
+        return 1;
+    }
+
+    if (params.verbose_prompt) {
+        fprintf(stdout, "\n");
+        fprintf(stdout, "%s", prompt_whisper.c_str());
+        fprintf(stdout, "\n");
+
+        fprintf(stdout, "\n");
+        fprintf(stdout, "%s", prompt_llama.c_str());
+        fprintf(stdout, "\n");
+        fprintf(stdout, "\n");
+        fflush(stdout);
+    }
+
+    printf("%s : done! start speaking in the microphone\n", __func__);
+    printf("\n");
+    printf("%s%s", name_ni.c_str(), chat_symb.c_str());
+    fflush(stdout);
+
+    // clear audio buffer
+    audio.clear();
+
+    // text inference variables
+    const int voice_id = params.voice_id;
+    const int n_keep   = embd_inp.size();
+    const int n_ctx    = llama_n_ctx(ctx_llama);
+
+    int n_past = n_keep;
+    int n_prev = 64; // TODO arg
+
+    std::vector<llama_token> embd;
+
+    // reverse prompts for detecting when it's time to stop speaking
+    std::vector<std::string> antiprompts = {
+        name_ni + chat_symb,
+    };
+
+    for (const auto & p : k_participants) {
+        antiprompts.push_back(p + chat_symb);
+    }
+
+    std::string text_heard_all;
+
+    // main loop
+    while (is_running) {
+        // handle Ctrl + C
+        is_running = sdl_poll_events();
+
+        if (!is_running) {
+            break;
+        }
+
+        // delay
+        std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+        int64_t t_ms = 0;
+
+        {
+            audio.get(15000, pcmf32_cur);
+
+            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
+                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
+
+                audio.get(params.voice_ms, pcmf32_cur);
+
+                std::string text_heard;
+
+                if (!force_speak) {
+                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prompt_whisper, prob0, t_ms));
+                }
+
+                // remove text between brackets using regex
+                {
+                    std::regex re("\\[.*?\\]");
+                    text_heard = std::regex_replace(text_heard, re, "");
+                }
+
+                // remove text between brackets using regex
+                {
+                    std::regex re("\\(.*?\\)");
+                    text_heard = std::regex_replace(text_heard, re, "");
+                }
+
+                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
+                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
+
+                // take first line
+                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
+
+                // remove leading and trailing whitespace
+                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
+                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
+
+                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
+
+                if (text_heard.empty() || tokens.empty() || force_speak) {
+                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
+                    audio.clear();
+
+                    continue;
+                }
+
+                force_speak = false;
+
+                if (text_heard[0] != ' ') {
+                    text_heard.insert(0, 1, ' ');
+                }
+
+                // replace homophones
+                for (const auto & homophone : k_homophones) {
+                    for (const auto & word : homophone.second) {
+                        text_heard = ::replace(text_heard, word, homophone.first);
+                    }
+                }
+
+                // check which participant was mentioned
+                const auto name_ref_old = name_ref;
+                for (const auto & participant : k_participants) {
+                    if (participant == name_ref) {
+                        continue;
+                    }
+
+                    if (text_heard.find(participant) != std::string::npos) {
+                        name_ref = participant;
+                        break;
+                    }
+                }
+                if (name_ref == name_ref_old && name_ref != name_ai) {
+                    name_ref = name_ni;
+                }
+
+                text_heard += "\n" + name_ref + chat_symb;
+                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
+                fflush(stdout);
+
+                text_heard_all += text_heard;
+                // keep only last 100 characters
+                if (text_heard_all.size() > 100) {
+                    text_heard_all = text_heard_all.substr(text_heard_all.size() - 100);
+                }
+
+                if (name_ref != name_ai) {
+                } else {
+                    // text inference
+                    bool done = false;
+                    std::string text_to_speak;
+
+                    embd = ::llama_tokenize(ctx_llama, text_heard_all, false);
+                    text_heard_all.clear();
+
+                    while (true) {
+                        // predict
+                        if (embd.size() > 0) {
+                            if (n_past + (int) embd.size() > n_ctx) {
+                                n_past = n_keep;
+
+                                // insert n_left/2 tokens at the start of embd from last_n_tokens
+                                embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
+
+                                //printf("\n---\n");
+                                //printf("resetting: '");
+                                //for (int i = 0; i < (int) embd.size(); i++) {
+                                //    printf("%s", llama_token_to_str(ctx_llama, embd[i]));
+                                //}
+                                //printf("'\n");
+                                //printf("\n---\n");
+                            }
+
+                            if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
+                                fprintf(stderr, "%s : failed to eval\n", __func__);
+                                return 1;
+                            }
+                        }
+
+                        //printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
+
+                        embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
+                        n_past += embd.size();
+                        embd.clear();
+
+                        if (done) break;
+
+                        {
+                            // out of user input, sample next token
+                            const float top_k          = 5;
+                            const float top_p          = 0.80f;
+                            const float temp           = 0.20f;
+                            const float repeat_penalty = 1.0764f;
+
+                            const int repeat_last_n    = 256;
+
+                            llama_token id = 0;
+
+                            {
+                                auto logits = llama_get_logits(ctx_llama);
+                                logits[llama_token_eos()] = 0;
+
+                                id = llama_sample_top_p_top_k(ctx_llama,
+                                        embd_inp.data() + std::max(0, n_past - repeat_last_n),
+                                        repeat_last_n, top_k, top_p, temp, repeat_penalty);
+                            }
+
+                            if (id != llama_token_eos()) {
+                                // add it to the context
+                                embd.push_back(id);
+
+                                text_to_speak += llama_token_to_str(ctx_llama, id);
+
+                                printf("%s", llama_token_to_str(ctx_llama, id));
+                            }
+
+                            // new line
+                            if (id == 13) {
+                            }
+                        }
+
+                        {
+                            std::string last_output;
+                            for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
+                                last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
+                            }
+                            last_output += llama_token_to_str(ctx_llama, embd[0]);
+
+                            for (const std::string & antiprompt : antiprompts) {
+                                if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
+                                    done = true;
+                                    text_to_speak = ::replace(text_to_speak, antiprompt, "");
+                                    fflush(stdout);
+                                    break;
+                                }
+                            }
+                        }
+
+                        is_running = sdl_poll_events();
+
+                        if (!is_running) {
+                            break;
+                        }
+                    }
+
+                    text_to_speak = ::replace(text_to_speak, "\"", "");
+                    system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
+                }
+
+                audio.clear();
+
+                ++n_iter;
+            }
+        }
+    }
+
+    audio.pause();
+
+    whisper_print_timings(ctx_wsp);
+    whisper_free(ctx_wsp);
+
+    llama_print_timings(ctx_llama);
+    llama_free(ctx_llama);
+
+    return 0;
+}
--- a/examples/talk.wasm/gpt-2.cpp
+++ b/examples/talk.wasm/gpt-2.cpp
@ -325,9 +325,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = NULL;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ false,
+        };
+

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -528,9 +531,11 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };

    struct ggml_context * ctx0 = ggml_init(params);

--- a/examples/talk/README.md
+++ b/examples/talk/README.md
@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
 Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:

 ```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
+wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
 ```

 ## TTS
--- a/examples/talk/gpt-2.cpp
+++ b/examples/talk/gpt-2.cpp
@ -325,9 +325,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

    // create the ggml context
    {
-        struct ggml_init_params params;
-        params.mem_size   = ctx_size;
-        params.mem_buffer = nullptr;
+        struct ggml_init_params params = {
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ nullptr,
+            /*.no_alloc   =*/ false,
+        };

        model.ctx = ggml_init(params);
        if (!model.ctx) {
@ -528,9 +530,11 @@ bool gpt2_eval(
        }
    }

-    struct ggml_init_params params;
-    params.mem_size   = buf_size;
-    params.mem_buffer = buf;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf,
+        /*.no_alloc   =*/ false,
+    };

    struct ggml_context * ctx0 = ggml_init(params);

--- a/examples/talk/speak.sh
+++ b/examples/talk/speak.sh
@ -7,7 +7,10 @@
 # Mac OS: brew install espeak
 # Linux: apt-get install espeak
 #
-espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
+#espeak -v en-us+m$1 -s 175 -p 50 -a 200 -g 5 -k 5 "$2"
+
+# Mac OS "say" command
+say "$2"

 # Eleven Labs
 #
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -9,4 +9,4 @@ To use:
 5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
 [^1]: I recommend the tiny or base models for running on an Android device.

-<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">
+<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
@ -2,6 +2,7 @@ package com.whispercppdemo.ui.main

 import androidx.compose.foundation.layout.*
 import androidx.compose.foundation.rememberScrollState
+import androidx.compose.foundation.text.selection.SelectionContainer
 import androidx.compose.foundation.verticalScroll
 import androidx.compose.material3.*
 import androidx.compose.runtime.Composable
@ -19,6 +20,7 @@ fun MainScreen(viewModel: MainScreenViewModel) {
        canTranscribe = viewModel.canTranscribe,
        isRecording = viewModel.isRecording,
        messageLog = viewModel.dataLog,
+        onBenchmarkTapped = viewModel::benchmark,
        onTranscribeSampleTapped = viewModel::transcribeSample,
        onRecordTapped = viewModel::toggleRecord
    )
@ -30,6 +32,7 @@ private fun MainScreen(
    canTranscribe: Boolean,
    isRecording: Boolean,
    messageLog: String,
+    onBenchmarkTapped: () -> Unit,
    onTranscribeSampleTapped: () -> Unit,
    onRecordTapped: () -> Unit
 ) {
@ -45,8 +48,11 @@ private fun MainScreen(
                .padding(innerPadding)
                .padding(16.dp)
        ) {
-            Row(horizontalArrangement = Arrangement.SpaceBetween) {
-                TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
+            Column(verticalArrangement = Arrangement.SpaceBetween) {
+                Row(horizontalArrangement = Arrangement.SpaceBetween, modifier = Modifier.fillMaxWidth()) {
+                    BenchmarkButton(enabled = canTranscribe, onClick = onBenchmarkTapped)
+                    TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
+                }
                RecordButton(
                    enabled = canTranscribe,
                    isRecording = isRecording,
@ -60,7 +66,16 @@ private fun MainScreen(

@Composable
 private fun MessageLog(log: String) {
-    Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
+    SelectionContainer() {
+        Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
+    }
+}
+
+// Button labelled "Benchmark": forwards `enabled` to the Material button and
+// invokes `onClick` when it is pressed.
+@Composable
+private fun BenchmarkButton(enabled: Boolean, onClick: () -> Unit) {
+    Button(enabled = enabled, onClick = onClick) {
+        Text(text = "Benchmark")
+    }
 }

@Composable
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
@ -41,10 +41,15 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {

    init {
        viewModelScope.launch {
+            printSystemInfo()
            loadData()
        }
    }

+    // Passes the system-info string returned by WhisperContext to printMessage.
+    private suspend fun printSystemInfo() {
+        val systemInfo = WhisperContext.getSystemInfo()
+        printMessage("System Info: $systemInfo\n")
+    }
+
    private suspend fun loadData() {
        printMessage("Loading data...\n")
        try {
@ -81,10 +86,29 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        //whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
    }

+    fun benchmark() = viewModelScope.launch { // runs in the ViewModel's coroutine scope
+        runBenchmark(6) // the benchmark thread count is hard-coded to 6
+    }
+
    fun transcribeSample() = viewModelScope.launch {
        transcribeAudio(getFirstSample())
    }

+    /**
+     * Runs the memcpy and ggml mul_mat benchmarks with [nthreads] threads and
+     * passes each result string to printMessage.
+     *
+     * Returns immediately when [canTranscribe] is false. While the benchmarks
+     * run, [canTranscribe] is held at false; it is restored in `finally` so a
+     * failing benchmark cannot leave it stuck at false.
+     * Failures are logged and printed the same way as in the transcription path.
+     */
+    private suspend fun runBenchmark(nthreads: Int) {
+        if (!canTranscribe) {
+            return
+        }
+
+        canTranscribe = false
+
+        try {
+            printMessage("Running benchmark. This will take minutes...\n")
+            whisperContext?.benchMemory(nthreads)?.let { printMessage(it) }
+            printMessage("\n")
+            whisperContext?.benchGgmlMulMat(nthreads)?.let { printMessage(it) }
+        } catch (e: Exception) {
+            Log.w(LOG_TAG, e)
+            printMessage("${e.localizedMessage}\n")
+        } finally {
+            // always re-enable the buttons, even if a benchmark threw
+            canTranscribe = true
+        }
+    }
+
    private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
        samplesPath.listFiles()!!.first()
    }
@ -114,11 +138,14 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        canTranscribe = false

        try {
-            printMessage("Reading wave samples...\n")
+            printMessage("Reading wave samples... ")
            val data = readAudioSamples(file)
+            printMessage("${data.size / (16000 / 1000)} ms\n")
            printMessage("Transcribing data...\n")
+            val start = System.currentTimeMillis()
            val text = whisperContext?.transcribeData(data)
-            printMessage("Done: $text\n")
+            val elapsed = System.currentTimeMillis() - start
+            printMessage("Done ($elapsed ms): $text\n")
        } catch (e: Exception) {
            Log.w(LOG_TAG, e)
            printMessage("${e.localizedMessage}\n")
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
@ -27,6 +27,14 @@ class WhisperContext private constructor(private var ptr: Long) {
        }
    }

+    // Calls the native memcpy benchmark inside this context's coroutine scope
+    // and returns the string it produces.
+    suspend fun benchMemory(nthreads: Int): String =
+        withContext(scope.coroutineContext) { WhisperLib.benchMemcpy(nthreads) }
+
+    // Calls the native ggml mul_mat benchmark inside this context's coroutine
+    // scope and returns the string it produces.
+    suspend fun benchGgmlMulMat(nthreads: Int): String =
+        withContext(scope.coroutineContext) { WhisperLib.benchGgmlMulMat(nthreads) }
+
    suspend fun release() = withContext(scope.coroutineContext) {
        if (ptr != 0L) {
            WhisperLib.freeContext(ptr)
@ -66,6 +74,10 @@ class WhisperContext private constructor(private var ptr: Long) {
            }
            return WhisperContext(ptr)
        }
+
+        // Returns the system-info string obtained from the native library.
+        fun getSystemInfo(): String = WhisperLib.getSystemInfo()
    }
 }

@ -74,6 +86,7 @@ private class WhisperLib {
        init {
            Log.d(LOG_TAG, "Primary ABI: ${Build.SUPPORTED_ABIS[0]}")
            var loadVfpv4 = false
+            var loadV8fp16 = false
            if (isArmEabiV7a()) {
                // armeabi-v7a needs runtime detection support
                val cpuInfo = cpuInfo()
@ -84,11 +97,24 @@ private class WhisperLib {
                        loadVfpv4 = true
                    }
                }
+            } else if (isArmEabiV8a()) {
+                // ARMv8.2a needs runtime detection support
+                val cpuInfo = cpuInfo()
+                cpuInfo?.let {
+                    Log.d(LOG_TAG, "CPU info: $cpuInfo")
+                    if (cpuInfo.contains("fphp")) {
+                        Log.d(LOG_TAG, "CPU supports fp16 arithmetic")
+                        loadV8fp16 = true
+                    }
+                }
            }

            if (loadVfpv4) {
                Log.d(LOG_TAG, "Loading libwhisper_vfpv4.so")
                System.loadLibrary("whisper_vfpv4")
+            } else if (loadV8fp16) {
+                Log.d(LOG_TAG, "Loading libwhisper_v8fp16_va.so")
+                System.loadLibrary("whisper_v8fp16_va")
            } else {
                Log.d(LOG_TAG, "Loading libwhisper.so")
                System.loadLibrary("whisper")
@ -103,6 +129,9 @@ private class WhisperLib {
        external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
        external fun getTextSegmentCount(contextPtr: Long): Int
        external fun getTextSegment(contextPtr: Long, index: Int): String
+        external fun getSystemInfo(): String
+        external fun benchMemcpy(nthread: Int): String
+        external fun benchGgmlMulMat(nthread: Int): String
    }
 }

@ -110,6 +139,10 @@ private fun isArmEabiV7a(): Boolean {
    return Build.SUPPORTED_ABIS[0].equals("armeabi-v7a")
 }

+// True when the first entry of Build.SUPPORTED_ABIS is "arm64-v8a".
+private fun isArmEabiV8a(): Boolean = Build.SUPPORTED_ABIS[0] == "arm64-v8a"
+
 private fun cpuInfo(): String? {
    return try {
        File("/proc/cpuinfo").inputStream().bufferedReader().use {
--- a/examples/whisper.android/app/src/main/jni/whisper/Android.mk
+++ b/examples/whisper.android/app/src/main/jni/whisper/Android.mk
@ -12,4 +12,15 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
 	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
 	LOCAL_CFLAGS += -mfpu=neon-vfpv4
 	include $(BUILD_SHARED_LIBRARY)
-endif
+endif
+
+ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
+	include $(CLEAR_VARS)
+	LOCAL_MODULE    := libwhisper_v8fp16_va
+	include $(LOCAL_PATH)/Whisper.mk
+	# Extra arm64-v8a variant compiled for ARMv8.2-A with the fp16 extension enabled (see -march below).
+	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
+	LOCAL_CFLAGS += -march=armv8.2-a+fp16
+	include $(BUILD_SHARED_LIBRARY)
+endif
+
--- a/examples/whisper.android/app/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c
@ -6,6 +6,7 @@
 #include <sys/sysinfo.h>
 #include <string.h>
 #include "whisper.h"
+#include "ggml.h"

 #define UNUSED(x) (void)(x)
 #define TAG "JNI"
@ -213,4 +214,30 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
    const char *text = whisper_full_get_segment_text(context, index);
    jstring string = (*env)->NewStringUTF(env, text);
    return string;
-}
+}
+
+// JNI entry point for WhisperLib.Companion.getSystemInfo(): converts the string
+// returned by whisper_print_system_info() into a Java string.
+JNIEXPORT jstring JNICALL
+Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
+        JNIEnv *env, jobject thiz
+) {
+    UNUSED(thiz);
+    return (*env)->NewStringUTF(env, whisper_print_system_info());
+}
+
+// JNI entry point for WhisperLib.Companion.benchMemcpy(nthread): calls
+// whisper_bench_memcpy_str() with the given thread count and returns its
+// output as a Java string.
+JNIEXPORT jstring JNICALL
+Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(
+        JNIEnv *env, jobject thiz, jint n_threads
+) {
+    UNUSED(thiz);
+    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
+    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
+    // The function is declared to return jstring; without this return the
+    // value seen by the Kotlin caller is undefined.
+    return string;
+}
+
+// JNI entry point for WhisperLib.Companion.benchGgmlMulMat(nthread): calls
+// whisper_bench_ggml_mul_mat_str() with the given thread count and returns its
+// output as a Java string.
+JNIEXPORT jstring JNICALL
+Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(
+        JNIEnv *env, jobject thiz, jint n_threads
+) {
+    UNUSED(thiz);
+    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
+    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
+    // The function is declared to return jstring; without this return the
+    // value seen by the Kotlin caller is undefined.
+    return string;
+}
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -24,3 +24,5 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
 This can significantly improve the performance of the transcription:

 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
+
+This project also adds `-O3 -DNDEBUG` to `Other C Flags`. Note that setting these flags on the app project is not ideal in a real-world project, because they apply to all C/C++ files; consider splitting the code into separate Xcode projects inside a workspace in your own project.
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -296,6 +296,10 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
+				OTHER_CFLAGS = (
+					"-O3",
+					"-DNDEBUG",
+				);
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
 			};
--- a/examples/whisper.swiftui/README.md
+++ b/examples/whisper.swiftui/README.md
@ -1,14 +1,18 @@
 A sample SwiftUI app using [whisper.cpp](https://github.com/ggerganov/whisper.cpp/) to do voice-to-text transcriptions.
 See also: [whisper.objc](https://github.com/ggerganov/whisper.cpp/tree/master/examples/whisper.objc).

-To use:
+**Usage**:

 1. Select a model from the [whisper.cpp repository](https://github.com/ggerganov/whisper.cpp/tree/master/models).[^1]
-2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
+2. Add the model to `whisper.swiftui.demo/Resources/models` **via Xcode**.
 3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
-4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
-5. Select the "release" build configuration under "Run", then deploy and run to your device.
+4. Add the sample audio file to `whisper.swiftui.demo/Resources/samples` **via Xcode**.
+5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
+
+**Note:** Pay attention to the folder path: `whisper.swiftui.demo/Resources/models` is the appropriate directory to place resources whilst `whisper.swiftui.demo/Models` is related to actual code.

 [^1]: I recommend the tiny, base or small models for running on an iOS device.

+[^2]: The `Release` build can boost the performance of transcription. This project also adds `-O3 -DNDEBUG` to `Other C Flags`. Note that setting these flags on the app project is not ideal in a real-world project, because they apply to all C/C++ files; consider splitting the code into separate Xcode projects inside a workspace in your own project.
+
 ![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)
--- a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
+++ b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
@ -430,6 +430,10 @@
 				LLVM_LTO = YES;
 				MACOSX_DEPLOYMENT_TARGET = 13.0;
 				MARKETING_VERSION = 1.0;
+				OTHER_CFLAGS = (
+					"-O3",
+					"-DNDEBUG",
+				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SDKROOT = auto;
--- a/extra/bench-wts.sh
+++ b/extra/bench-wts.sh
@ -0,0 +1,70 @@
+#!/bin/bash
+#
+# Benchmark word-level timestamps for different models
+#
+# This script takes two arguments
+# - an audio file
+# - [optional] path to a font file
+#
+# For every model in $models it runs ./main with -owts, measures the elapsed
+# time, runs the generated .wts script to produce a video, and finally stacks
+# all per-model videos into "<audio file>.all.mp4".
+#
+# Uses: ffprobe, ffmpeg, bc
+
+# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu
+
+if [ -z "$1" ]; then
+    echo "Usage: $0 <audio file> [font file]"
+    exit 1
+fi
+
+#TODO: Make this a command line parameter
+#models="base small large"
+#models="tiny.en tiny base.en base small.en small medium.en medium large-v1 large"
+models="tiny.en base.en small.en medium.en large"
+
+DURATION=$(ffprobe -i "$1" -show_entries format=duration -v quiet -of csv="p=0")
+DURATION=$(printf "%.2f" $DURATION)
+echo "Input file duration: ${DURATION}s"
+
+for model in $models; do
+    echo "Running $model"
+    # NOTE: COMMAND is expanded unquoted below (word splitting builds the argument
+    # list), so file names containing whitespace are not supported here
+    COMMAND="./main -m models/ggml-$model.bin -owts -f $1 -of $1.$model"
+
+    if [ -n "$2" ]; then
+        COMMAND="$COMMAND -fp $2"
+    fi
+    #TODO: Surface errors better
+    # TIMEFMT is for zsh, TIMEFORMAT is for bash
+    EXECTIME=$({ TIMEFMT="%E";TIMEFORMAT=%E; time $COMMAND >/dev/null 2>&1; } 2>&1)
+
+    # Slightly different formats between zsh and bash: drop a trailing "s" if
+    # there is one. ${var%s} is used instead of ${var::-1} because a negative
+    # substring length is not accepted by bash versions older than 4.2
+    EXECTIME=${EXECTIME%s}
+
+    RATIO=$(echo "$DURATION / $EXECTIME" | bc -l)
+    RATIO=$(printf "%.2f" $RATIO)
+
+    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"
+
+    # If the file already exists, delete it
+    if [ -f "$1.mp4" ]; then
+        rm "$1.mp4"
+    fi
+
+    # run the generated script and keep its output under a per-model name
+    bash "$1.$model.wts" >/dev/null 2>&1
+    mv "$1.mp4" "$1.$model.mp4"
+
+    # render a caption strip with the model name and timing
+    ffmpeg -y -f lavfi -i color=c=black:s=1200x50:d=$DURATION -vf "drawtext=fontfile=$2:fontsize=36:x=10:y=(h-text_h)/2:text='ggml-$model - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey" "$1.$model.info.mp4" >/dev/null 2>&1
+done
+
+# build one ffmpeg invocation that stacks all caption/video pairs vertically
+COMMAND="ffmpeg -y"
+for model in $models; do
+    COMMAND="$COMMAND -i $1.$model.info.mp4 -i $1.$model.mp4"
+done
+COMMAND="$COMMAND -filter_complex \""
+COUNT=0
+for model in $models; do
+    COMMAND="$COMMAND[${COUNT}:v][$(($COUNT+1)):v]"
+    COUNT=$((COUNT+2))
+done
+COMMAND="$COMMAND vstack=inputs=${COUNT}[v]\" -map \"[v]\" -map 1:a $1.all.mp4 >/dev/null 2>&1"
+
+echo $COMMAND
+
+# Run the command
+eval $COMMAND
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -198,6 +198,8 @@ struct ggml_object;
 struct ggml_context;

 enum ggml_type {
+    GGML_TYPE_Q4_0,
+    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
@ -226,7 +228,9 @@ enum ggml_op {
    GGML_OP_STEP,
    GGML_OP_RELU,
    GGML_OP_GELU,
+    GGML_OP_SILU,
    GGML_OP_NORM, // normalize
+    GGML_OP_RMS_NORM,

    GGML_OP_MUL_MAT,

@ -312,6 +316,7 @@ struct ggml_init_params {
    // memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
+    bool   no_alloc;   // don't allocate memory for the tensor data
 };

 void    ggml_time_init(void); // call this once at the beginning of the program
@ -326,7 +331,10 @@ void ggml_print_objects(const struct ggml_context * ctx);
 int    ggml_nelements(const struct ggml_tensor * tensor);
 size_t ggml_nbytes   (const struct ggml_tensor * tensor);

-size_t ggml_type_size   (enum ggml_type type);
+int    ggml_blck_size (enum ggml_type type);
+size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
+float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+
 size_t ggml_element_size(const struct ggml_tensor * tensor);

 struct ggml_context * ggml_init(struct ggml_init_params params);
@ -336,6 +344,13 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

+bool ggml_mlock_supported(void);
+bool ggml_mlock(
+        struct ggml_context * ctx,
+        const void *opt_extra_addr,
+        size_t opt_extra_len,
+        char **err_p);
+
 struct ggml_tensor * ggml_new_tensor(
        struct ggml_context * ctx,
        enum   ggml_type type,
@ -466,12 +481,20 @@ struct ggml_tensor * ggml_gelu(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

+struct ggml_tensor * ggml_silu(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
        struct ggml_context * ctx,
        struct ggml_tensor  * a);

+struct ggml_tensor * ggml_rms_norm(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a);
+
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
@ -726,6 +749,13 @@ enum ggml_opt_result ggml_opt(
        struct ggml_opt_params params,
        struct ggml_tensor * f);

+//
+// quantization
+//
+
+size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
+size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
+
 //
 // system info
 //
--- a/models/README.md
+++ b/models/README.md
@ -6,7 +6,7 @@ using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either
 the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
 script to download the already converted models. Currently, they are hosted on the following locations:

- https://huggingface.co/datasets/ggerganov/whisper.cpp
+- https://huggingface.co/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 Sample usage:
@ -23,7 +23,7 @@ You can now use it like this:

 A third option to obtain the model files is to download them from Hugging Face:

-https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
+https://huggingface.co/ggerganov/whisper.cpp/tree/main

 ## Available models

--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@ -79,11 +79,11 @@ dir_model   = sys.argv[1]
 dir_whisper = sys.argv[2]
 dir_out     = sys.argv[3]

-with open(dir_model + "/vocab.json", "r") as f:
+with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
    encoder = json.load(f)
-with open(dir_model + "/added_tokens.json", "r") as f:
+with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
    encoder_added = json.load(f)
-with open(dir_model + "/config.json", "r") as f:
+with open(dir_model + "/config.json", "r", encoding="utf8") as f:
    hparams = json.load(f)

 model = WhisperForConditionalGeneration.from_pretrained(dir_model)
--- a/models/download-ggml-model.cmd
+++ b/models/download-ggml-model.cmd
@ -40,7 +40,7 @@ if exist "ggml-%model%.bin" (
  goto :eof
 )

-PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
+PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"

 if %ERRORLEVEL% neq 0 (
  echo Failed to download ggml model %model%
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -6,13 +6,13 @@
 #src="https://ggml.ggerganov.com"
 #pfx="ggml-model-whisper"

-src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
+src="https://huggingface.co/ggerganov/whisper.cpp"
 pfx="resolve/main/ggml"

 # get the path of this script
 function get_script_path() {
    if [ -x "$(command -v realpath)" ]; then
-        echo "$(dirname $(realpath $0))"
+        echo "$(dirname "$(realpath "$0")")"
    else
        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
        echo "$ret"
--- a/talk-ggama.sh
+++ b/talk-ggama.sh
@ -0,0 +1,6 @@
+./talk-llama \
+    -mw ./models/ggml-small.en.bin \
+    -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
+    --name-ni "Georgi" \
+    --name-ai "GGaMA" \
+    -t 8 -vid 1 --speak ./examples/talk-llama/speak.sh
--- a/talk-llama.sh
+++ b/talk-llama.sh
@ -0,0 +1,6 @@
+./talk-llama \
+    -mw ./models/ggml-small.en.bin \
+    -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
+    --name-ni "Georgi" \
+    --name-ai "LLaMA" \
+    -t 8 -vid 0 --speak ./examples/talk-llama/speak.sh
--- a/talk-rrama.sh
+++ b/talk-rrama.sh
@ -0,0 +1,6 @@
+./talk-llama \
+    -mw ./models/ggml-small.en.bin \
+    -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
+    --name-ni "Georgi" \
+    --name-ai "RRaMA" \
+    -t 8 -vid 3 --speak ./examples/talk-llama/speak.sh
--- a/talk-ssama.sh
+++ b/talk-ssama.sh
@ -0,0 +1,6 @@
+./talk-llama \
+    -mw ./models/ggml-small.en.bin \
+    -ml ../llama.cpp/models/13B/ggml-model-q4_0.bin \
+    --name-ni "Georgi" \
+    --name-ai "SSaMA" \
+    -t 8 -vid 2 --speak ./examples/talk-llama/speak.sh
--- a/whisper.cpp
+++ b/whisper.cpp
--- a/whisper.h
+++ b/whisper.h
@ -66,6 +66,7 @@ extern "C" {
    //

    struct whisper_context;
+    struct whisper_state;

    typedef int whisper_token;

@ -101,11 +102,20 @@ extern "C" {
    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);

-    // Frees all memory allocated by the model.
-    WHISPER_API void whisper_free(struct whisper_context * ctx);
+    // These are the same as the above, but the internal state of the context is not allocated automatically
+    // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
+    WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
+    WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
+    WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
+
+    WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
+
+    // Frees all allocated memory
+    WHISPER_API void whisper_free      (struct whisper_context * ctx);
+    WHISPER_API void whisper_free_state(struct whisper_state * state);

    // Convert RAW PCM audio to log mel spectrogram.
-    // The resulting spectrogram is stored inside the provided whisper context.
+    // The resulting spectrogram is stored inside the default state of the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel(
            struct whisper_context * ctx,
@ -113,17 +123,30 @@ extern "C" {
                               int   n_samples,
                               int   n_threads);

-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. 
-    // The resulting spectrogram is stored inside the provided whisper context.
+    WHISPER_API int whisper_pcm_to_mel_with_state(
+            struct whisper_context * ctx,
+              struct whisper_state * state,
+                       const float * samples,
+                               int   n_samples,
+                               int   n_threads);
+
+    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
+    // The resulting spectrogram is stored inside the default state of the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context* ctx,
-        const float* samples,
-        int   n_samples,
-        int   n_threads);
+        struct whisper_context * ctx,
+                   const float * samples,
+                           int   n_samples,
+                           int   n_threads);

+    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
+        struct whisper_context * ctx,
+          struct whisper_state * state,
+                   const float * samples,
+                           int   n_samples,
+                           int   n_threads);

-    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
+    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
    // Returns 0 on success
@ -133,7 +156,14 @@ extern "C" {
                               int   n_len,
                               int   n_mel);

-    // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
+    WHISPER_API int whisper_set_mel_with_state(
+            struct whisper_context * ctx,
+              struct whisper_state * state,
+                       const float * data,
+                               int   n_len,
+                               int   n_mel);
+
+    // Run the Whisper encoder on the log mel spectrogram stored inside the default state of the provided whisper context.
    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
    // offset can be used to specify the offset of the first frame in the spectrogram.
    // Returns 0 on success
@ -142,6 +172,12 @@ extern "C" {
                               int   offset,
                               int   n_threads);

+    WHISPER_API int whisper_encode_with_state(
+            struct whisper_context * ctx,
+              struct whisper_state * state,
+                               int   offset,
+                               int   n_threads);
+
    // Run the Whisper decoder to obtain the logits and probabilities for the next token.
    // Make sure to call whisper_encode() first.
    // tokens + n_tokens is the provided context for the decoder.
@ -155,6 +191,14 @@ extern "C" {
                               int   n_past,
                               int   n_threads);

+    WHISPER_API int whisper_decode_with_state(
+            struct whisper_context * ctx,
+              struct whisper_state * state,
+               const whisper_token * tokens,
+                               int   n_tokens,
+                               int   n_past,
+                               int   n_threads);
+
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
@ -190,20 +234,44 @@ extern "C" {
                               int   n_threads,
                             float * lang_probs);

-    WHISPER_API int whisper_n_len          (struct whisper_context * ctx); // mel length
-    WHISPER_API int whisper_n_vocab        (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_text_ctx     (struct whisper_context * ctx);
-    WHISPER_API int whisper_n_audio_ctx    (struct whisper_context * ctx);
-    WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
+    WHISPER_API int whisper_lang_auto_detect_with_state(
+            struct whisper_context * ctx,
+              struct whisper_state * state,
+                               int   offset_ms,
+                               int   n_threads,
+                             float * lang_probs);
+
+    WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
+    WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
+    WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
+    WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
+
+    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_f16          (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);

    // Token logits obtained from the last call to whisper_decode()
    // The logits for the last token are stored in the last row
    // Rows: n_tokens
    // Cols: n_vocab
-    WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
+    WHISPER_API float * whisper_get_logits           (struct whisper_context * ctx);
+    WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);

    // Token Id -> String. Uses the vocabulary in the provided context
    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
+    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
+

    // Special tokens
    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
@ -218,7 +286,7 @@ extern "C" {
    WHISPER_API whisper_token whisper_token_translate (void);
    WHISPER_API whisper_token whisper_token_transcribe(void);

-    // Performance information
+    // Performance information from the default state.
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

@ -236,18 +304,22 @@ extern "C" {
    // Text segment callback
    // Called on every newly generated text segment
    // Use the whisper_full_...() functions to obtain the text segments
-    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
+    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
+
+    // Progress callback, called on each progress update
+    typedef void (*whisper_progress_callback)(struct whisper_context * ctx, struct whisper_state * state, int progress, void * user_data);

    // Encoder begin callback
    // If not NULL, called before the encoder starts
    // If it returns false, the computation is aborted
-    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
+    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);

    // Logits filter callback
    // Can be used to modify the logits before sampling
    // If not NULL, called after applying temperature to logits
    typedef void (*whisper_logits_filter_callback)(
            struct whisper_context * ctx,
+              struct whisper_state * state,
          const whisper_token_data * tokens,
                               int   n_tokens,
                             float * logits,
@ -287,6 +359,7 @@ extern "C" {

        // tokens to provide to the whisper decoder as initial prompt
        // these are prepended to any existing text context from a previous call
+        const char * initial_prompt;
        const whisper_token * prompt_tokens;
        int prompt_n_tokens;

@ -322,6 +395,10 @@ extern "C" {
        whisper_new_segment_callback new_segment_callback;
        void * new_segment_callback_user_data;

+        // called on each progress update
+        whisper_progress_callback progress_callback;
+        void * progress_callback_user_data;
+
        // called each time before the encoder starts
        whisper_encoder_begin_callback encoder_begin_callback;
        void * encoder_begin_callback_user_data;
@ -334,6 +411,7 @@ extern "C" {
    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);

    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
+    // Not thread safe if executed in parallel on the same context.
    // Uses the specified decoding strategy to obtain the text.
    WHISPER_API int whisper_full(
                struct whisper_context * ctx,
@ -341,7 +419,16 @@ extern "C" {
                           const float * samples,
                                   int   n_samples);

-    // Split the input audio in chunks and process each chunk separately using whisper_full()
+    WHISPER_API int whisper_full_with_state(
+                struct whisper_context * ctx,
+                  struct whisper_state * state,
+            struct whisper_full_params   params,
+                           const float * samples,
+                                   int   n_samples);
+
+    // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
+    // Result is stored in the default state of the context
+    // Not thread safe if executed in parallel on the same context.
    // It seems this approach can offer some speedup in some cases.
    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
    WHISPER_API int whisper_full_parallel(
@ -351,40 +438,56 @@ extern "C" {
                                   int   n_samples,
                                   int   n_processors);

-    // Number of generated text segments.
+    // Number of generated text segments
    // A segment can be a few words, a sentence, or even a paragraph.
-    WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
+    WHISPER_API int whisper_full_n_segments           (struct whisper_context * ctx);
+    WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);

-    // Language id associated with the current context
+    // Language id associated with the context's default state
    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);

-    // Get the start and end time of the specified segment.
-    WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
+    // Language id associated with the provided state
+    WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);

-    // Get the text of the specified segment.
-    WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
+    // Get the start and end time of the specified segment
+    WHISPER_API int64_t whisper_full_get_segment_t0           (struct whisper_context * ctx, int i_segment);
+    WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);

-    // Get number of tokens in the specified segment.
-    WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
+    WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
+    WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);

-    // Get the token text of the specified token in the specified segment.
-    WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
+    // Get the text of the specified segment
+    WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
+    WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);

-    // Get token data for the specified token in the specified segment.
+    // Get number of tokens in the specified segment
+    WHISPER_API int whisper_full_n_tokens           (struct whisper_context * ctx, int i_segment);
+    WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
+
+    // Get the token text of the specified token in the specified segment
+    WHISPER_API const char * whisper_full_get_token_text           (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
+
+    WHISPER_API whisper_token whisper_full_get_token_id           (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
+
+    // Get token data for the specified token in the specified segment
    // This contains probabilities, timestamps, etc.
-    WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API whisper_token_data whisper_full_get_token_data           (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);

-    // Get the probability of the specified token in the specified segment.
-    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
+    // Get the probability of the specified token in the specified segment
+    WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);

    ////////////////////////////////////////////////////////////////////////////

    // Temporary helpers needed for exposing ggml interface

    WHISPER_API int whisper_bench_memcpy(int n_threads);
+    WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
    WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
+    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);

 #ifdef __cplusplus
 }
Author	SHA1	Message	Date
Georgi Gerganov	c456ca476b	llama podcast	2023-04-01 13:13:27 +03:00
pajowu	0a2d1210bc	whisper : add progress callback (#600 )	2023-03-30 20:29:29 +03:00
Zigfrid Zvezdin	859ffc994e	misc : typo (#688 )	2023-03-30 07:51:33 +03:00
InconsolableCellist	5e6e2187a3	talk-llama : fixing usage message for talk-llama (#687 ) "-ml" instead of "-mg" for specifying the llama file	2023-03-30 00:10:20 +03:00
Georgi Gerganov	a7f1f33715	main : add <cstring> header	2023-03-29 23:59:45 +03:00
Lucas Zanek	86ecfc6333	whisper.addon : fixed test to new async implementation (#686 ) * fixed blocking code on node addon * modify the example to run async * format * added logic to see the whisper output * added logic to see the whisper output * removed extra function for more clean example * fixed whisper test to new async implementation	2023-03-29 23:59:17 +03:00
be-next	18e6fb0287	models : handle spaces and special characters in shell script paths (#677 ) This commit modifies the `get_script_path` function to correctly handle spaces and special characters in directory paths. The fix involves adding double quotes around variables and commands where needed to ensure proper parsing of paths with spaces and special characters.	2023-03-29 23:38:33 +03:00
Egor Egorov	0f759f125d	main : fix typo in JSON output (#648 ) * typo in JSON output * fix double quotes in JSON output	2023-03-29 23:26:39 +03:00
Jhen-Jie Hong	eefed45e37	whisper : add initial_prompt param (#645 )	2023-03-29 23:23:23 +03:00
clach04	aac1710afb	make : 32-bit ARM flags (#486 ) * issue #470 - working 32-bit ARM * Update Makefile * Update Makefile --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-29 23:11:35 +03:00
Jonno	21c1e6afc5	whisper.swiftui : update README.md (#682 ) - Slight tweaks to README for improved comprehension.	2023-03-29 23:04:38 +03:00
Evan Jones	a47e812a54	talk-llama : add alpaca support (#668 )	2023-03-29 23:01:14 +03:00
Georgi Gerganov	42c6855103	whisper : bump "large" scratch buffer even mode (close #671 )	2023-03-28 10:50:49 +03:00
Georgi Gerganov	0be9cd3497	whisper : increase scratch buffers after recent change (#671 ) Should fix the error: ggml_new_tensor_impl: not enough space in the scratch memory	2023-03-28 10:36:16 +03:00
Georgi Gerganov	e5c197d8aa	talk-llama : add discussion link	2023-03-28 10:11:34 +03:00
Georgi Gerganov	7cd1d3bc34	talk-llama : try to fix windows build ..	2023-03-27 22:40:59 +03:00
Georgi Gerganov	82637b8e9f	readme : add talk-llama example to the table	2023-03-27 21:02:35 +03:00
Georgi Gerganov	4a0deb8b1e	talk-llama : add new example + sync ggml from llama.cpp (#664 ) * talk-llama : talk with LLaMA AI * talk.llama : disable EOS token * talk-llama : add README instructions * ggml : fix build in debug	2023-03-27 21:00:32 +03:00
Georgi Gerganov	8e361d90d7	whisper : disable fallbacks until the performance is improved (#588 )	2023-03-22 22:34:39 +02:00
Andrew Huynh	fc49c44426	cmake : add a flag to disable F16C (#628 )	2023-03-22 22:30:40 +02:00
jwijffels	aec01bb337	Include link to R wrapper in README (#626 )	2023-03-22 22:28:22 +02:00
Lucas Zanek	21165580a1	Nodejs Addon blocking main thread. Implemented Napi::AsyncWorker (#642 ) * fixed blocking code on node addon * modify the example to run async * format * added logic to see the whisper output * added logic to see the whisper output * removed extra function for more clean example	2023-03-22 22:19:22 +02:00
Jhen-Jie Hong	1d749919e3	whisper.objc : add `-O3 -DNDEBUG` in release mode (#640 )	2023-03-22 22:16:04 +02:00
sandrohanea	d4fa0d92ad	fixed language auto-detection for state provided processing (#627 ) Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com>	2023-03-22 21:47:09 +02:00
Jhen-Jie Hong	a5e60c019d	readme : add react-native bindings (#619 )	2023-03-22 21:39:02 +02:00
Leo Moll	8fcd1a3b32	main : provide option for creating JSON output (#615 ) * examples : provide option for exporting also as JSON file (ggerganov/whisper.cpp#614) * main : remove leftovers --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-22 21:37:36 +02:00
Kamilake	992aa2cd1b	models : change default encoding to utf8 (#605 )	2023-03-22 21:17:24 +02:00
Georgi Gerganov	4aa3bcf8a4	make : fix MUSL Linux build (#576 )	2023-03-22 20:51:42 +02:00
Georgi Gerganov	1beff6f66d	models : change HF hosting from dataset to model	2023-03-22 20:44:56 +02:00
Takeshi Inoue	09e9068007	whisper.android : support benchmark for Android example. (#542 ) * whisper.android: Support benchmark for Android example. * whisper.android: update screenshot in README. * update: Make text selectable for copy & paste. * Update whisper.h to restore API name Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * whisper.android: Restore original API names. --------- Co-authored-by: tinoue <tinoue@xevo.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-07 21:36:30 +02:00
Georgi Gerganov	fa9d43181f	readme : add bench-wts.sh demo	2023-03-06 21:06:27 +02:00
Georgi Gerganov	bb6b54a03d	bench-wts.sh : rename script + add execute permission	2023-03-06 21:02:24 +02:00
venkr	b597c5a779	qual-bench.sh : add quality comparison tool, and update main.cpp to allow using a font file (#569 )	2023-03-06 19:18:11 +02:00
Takeshi Inoue	a3fb6c507f	whisper.android : enable fp16 instrinsics (FP16_VA) which is supported by ARMv8.2 or later. (#572 )	2023-03-06 19:15:57 +02:00
sandrohanea	59fdcd19c8	whisper : add whisper_state + default state on the whisper_context (#523 ) * Added whisper state + default state on the whisper_context * Fixed some examples and bindings * Fixed whisper_n_len (which was used in some binding) and added whisper_n_len_from_state * Fixed comments * whisper : reuse kv_cache_free() and fix compiler warnings * whisper : clean-up the API comments --------- Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-05 21:42:19 +02:00
Georgi Gerganov	478289a4b3	whisper : set no_context == true by default (#537 )	2023-03-05 20:53:43 +02:00
polarmoon	5e94129cb2	go : NewContext now returns a clean context (#537 ) Co-authored-by: Ming <ming@localhost>	2023-03-05 20:50:25 +02:00
HY. Kelvin Lee	72af0f5697	main : add csv header (#552 )	2023-03-02 18:32:16 +02:00
Georgi Gerganov	af005d573f	make : add -DNDEBUG compile flag	2023-02-28 23:27:54 +02:00