rebase on master after whisper_state changes

coreml : simplify whisper_encode + log messages
coreml : use Core ML encoder inference
2023-03-26 16:09:06 +03:00 · 2023-03-26 15:48:45 +03:00 · 2023-03-26 15:48:41 +03:00 · 2023-03-22 22:34:39 +02:00 · 2023-03-22 22:30:40 +02:00 · 2023-03-22 22:28:22 +02:00
44 changed files with 2147 additions and 2493 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,5 +1,7 @@
 *.o
 *.a
 *.mlmodel
 *.mlmodelc
 .cache/
 .vs/
 .vscode/
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.0)
-project(whisper.cpp VERSION 1.2.0)
+project(whisper.cpp VERSION 1.2.1)
 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@ -54,6 +54,8 @@ if (APPLE)
    option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
    option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
    option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
    option(WHISPER_COREML              "whisper: enable Core ML framework" OFF)
 else()
    option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@ -86,9 +88,12 @@ endif()
 find_package(Threads REQUIRED)
-# on APPLE - include Accelerate framework
+# on APPLE
-if (APPLE AND NOT WHISPER_NO_ACCELERATE)
+if (APPLE)
    # include Accelerate framework
    if (NOT WHISPER_NO_ACCELERATE)
        find_library(ACCELERATE_FRAMEWORK Accelerate)
        if (ACCELERATE_FRAMEWORK)
            message(STATUS "Accelerate framework found")
@ -97,6 +102,20 @@ if (APPLE AND NOT WHISPER_NO_ACCELERATE)
        else()
            message(WARNING "Accelerate framework not found")
        endif()
    endif()
    # Core ML support (opt-in through the WHISPER_COREML option).
    # Both frameworks are linked into the whisper.coreml target, which is
    # created whenever WHISPER_COREML is ON. A missing framework is therefore
    # reported here as a hard error instead of surfacing later as a
    # "<VAR>-NOTFOUND" link problem.
    if (WHISPER_COREML)
        find_library(FOUNDATION_FRAMEWORK Foundation)
        find_library(COREML_FRAMEWORK CoreML)
        if (COREML_FRAMEWORK AND FOUNDATION_FRAMEWORK)
            message(STATUS "CoreML framework found")
            # Tells the sources that the Core ML encoder path is compiled in
            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
        else()
            message(FATAL_ERROR "WHISPER_COREML is ON but the Foundation/CoreML frameworks were not found")
        endif()
    endif()
 endif()
 if (WHISPER_SUPPORT_OPENBLAS)
@ -172,15 +191,44 @@ else()
            if(NOT WHISPER_NO_FMA)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
            endif()
            if(NOT WHISPER_NO_F16C)
                set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
            endif()
        endif()
    endif()
 endif()
 if (WHISPER_PERF)
    set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()
 #
 # whisper.coreml - Core ML support
 #
 # Builds the Core ML encoder wrapper as its own library so that the main
 # `whisper` target can link against it when WHISPER_COREML is enabled.
 if (WHISPER_COREML)
    set(TARGET whisper.coreml)
    # Objective-C / Objective-C++ sources wrapping the Core ML encoder model
    add_library(${TARGET}
        coreml/whisper-encoder.h
        coreml/whisper-encoder.mm
        coreml/whisper-encoder-impl.h
        coreml/whisper-encoder-impl.m
        )
    # NOTE(review): presumably a project module under cmake/ that applies
    # common options to ${TARGET} - confirm against cmake/DefaultTargetOptions
    include(DefaultTargetOptions)
    # The .mm file imports its headers as "coreml/...", i.e. relative to the
    # repository root
    target_include_directories(${TARGET} PUBLIC
        .
        )
    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
    # whisper-encoder-impl.m refuses to compile without ARC (#error guard)
    set_target_properties(${TARGET} PROPERTIES
        COMPILE_FLAGS "-fobjc-arc"
        )
 endif()
 #
 # whisper - this is the main library of the project
 #
@ -200,6 +248,10 @@ target_include_directories(${TARGET} PUBLIC
    .
    )
 if (WHISPER_COREML)
    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
 endif()
 if (MSVC)
    target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
--- a/56
+++ b/56
@ -30,10 +30,16 @@ endif
 # Compile flags
 #
-CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =
 # ref: https://github.com/ggerganov/whisper.cpp/issues/37
 ifneq ($(wildcard /usr/include/musl/*),)
 	CFLAGS   += -D_POSIX_SOURCE -D_GNU_SOURCE
 	CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
 endif
 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)
@ -132,6 +138,10 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
 ifdef WHISPER_COREML
 	CXXFLAGS += -DWHISPER_USE_COREML
 	LDFLAGS  += -framework Foundation -framework CoreML
 endif
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
@ -141,6 +151,8 @@ ifdef WHISPER_GPROF
 	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
 	CFLAGS += -mcpu=native
 	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, 2, 3
@ -182,11 +194,23 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
-libwhisper.a: ggml.o whisper.o
+ifndef WHISPER_COREML
-	$(AR) rcs libwhisper.a ggml.o whisper.o
+WHISPER_OBJ = whisper.o
 else
 whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
 	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
-libwhisper.so: ggml.o whisper.o
+whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
-	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
+	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
 WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
 endif
 libwhisper.a: ggml.o $(WHISPER_OBJ)
 	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
 libwhisper.so: ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
 clean:
 	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
@ -200,21 +224,21 @@ CC_SDL=`sdl2-config --cflags --libs`
 SRC_COMMON = examples/common.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp
-main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
 	./main -h
-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
-bench: examples/bench/bench.cpp ggml.o whisper.o
+bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
 #
 # Audio samples
--- a/README.md
+++ b/README.md
@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
-Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -433,6 +433,19 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a
 ---
 ## Video comparison of different models
 Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:
 ```java
 ./extra/bench-wts.sh samples/jfk.wav
 ffplay ./samples/jfk.wav.all.mp4
 ```
 https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
 ---
 ## Benchmarks
 In order to have an objective comparison of the performance of the inference across different system configurations,
@ -453,7 +466,7 @@ The original models are converted to a custom binary format. This allows to pack
 You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
 or manually from here:
- https://huggingface.co/datasets/ggerganov/whisper.cpp
+- https://huggingface.co/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com
 For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
@ -463,6 +476,7 @@ in [models](models).
 - [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
 - [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
@ -471,6 +485,8 @@ in [models](models).
  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
 - [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
 - [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
 ## Examples
--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@ -17,7 +17,7 @@ import (
 // CONSTANTS
 const (
-	srcUrl  = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
+	srcUrl  = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
 	srcExt  = ".bin"                                                      // Filename extension
 	bufSize = 1024 * 64                                                   // Size of the buffer used for downloading the model
 )
--- a/bindings/go/pkg/whisper/model.go
+++ b/bindings/go/pkg/whisper/model.go
@ -94,6 +94,7 @@ func (model *model) NewContext() (Context, error) {
 	params.SetPrintRealtime(false)
 	params.SetPrintTimestamps(false)
 	params.SetThreads(runtime.NumCPU())
 	params.SetNoContext(true)
 	// Return new context
 	return newContext(model, params)
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@ -20,7 +20,7 @@ extern bool callEncoderBegin(void* user_data);
 // Text segment callback
 // Called on every newly generated text segment
 // Use the whisper_full_...() functions to obtain the text segments
-static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void* user_data) {
+static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_state* state, int n_new, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        callNewSegment(user_data, n_new);
    }
@ -29,7 +29,7 @@ static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void*
 // Encoder begin callback
 // If not NULL, called before the encoder starts
 // If it returns false, the computation is aborted
-static bool whisper_encoder_begin_cb(struct whisper_context* ctx, void* user_data) {
+static bool whisper_encoder_begin_cb(struct whisper_context* ctx, struct whisper_state* state, void* user_data) {
    if(user_data != NULL && ctx != NULL) {
        return callEncoderBegin(user_data);
    }
--- a/bindings/ios
+++ b/bindings/ios
--- a/bindings/javascript/package.json
+++ b/bindings/javascript/package.json
@ -1,6 +1,6 @@
 {
  "name": "whisper.cpp",
-  "version": "1.2.0",
+  "version": "1.2.1",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@ -199,7 +199,7 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
  {
    static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-    rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
+    rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
      bool is_aborted = *(bool*)user_data;
      return !is_aborted;
    };
--- a/coreml/whisper-encoder-impl.h
+++ b/coreml/whisper-encoder-impl.h
@ -0,0 +1,142 @@
 //
 // CoremlEncoder.h
 //
 // This file was automatically generated and should not be edited.
 //
 #import <Foundation/Foundation.h>
 #import <CoreML/CoreML.h>
 #include <stdint.h>
 #include <os/log.h>
 NS_ASSUME_NONNULL_BEGIN
 /// Model Prediction Input Type
 API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
 /// melSegment as 1 × 80 × 3000 3-dimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
 - (instancetype)init NS_UNAVAILABLE;
 - (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
@end
 /// Model Prediction Output Type
 API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
 /// output as multidimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * output;
 - (instancetype)init NS_UNAVAILABLE;
 - (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
@end
 /// Class for model loading and prediction
 API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoder : NSObject
@property (readonly, nonatomic, nullable) MLModel * model;
 /**
    URL of the underlying .mlmodelc directory.
 */
 + (nullable NSURL *)URLOfModelInThisBundle;
 /**
    Initialize CoremlEncoder instance from an existing MLModel object.
    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
 */
 - (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
 /**
    Initialize CoremlEncoder instance with the model in this bundle.
 */
 - (nullable instancetype)init;
 /**
    Initialize CoremlEncoder instance with the model in this bundle.
    @param configuration The model configuration object
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 /**
    Initialize CoremlEncoder instance from the model URL.
    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 /**
    Initialize CoremlEncoder instance from the model URL.
    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
    @param configuration The model configuration object
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 /**
    Construct CoremlEncoder instance asynchronously with configuration.
    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
 */
 + (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
 /**
    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
    @param modelURL The model URL.
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
 */
 + (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
 /**
    Make a prediction using the standard interface
    @param input an instance of CoremlEncoderInput to predict from
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
    @return the prediction as CoremlEncoderOutput
 */
 - (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 /**
    Make a prediction using the standard interface
    @param input an instance of CoremlEncoderInput to predict from
    @param options prediction options
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
    @return the prediction as CoremlEncoderOutput
 */
 - (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 /**
    Make a prediction using the convenience interface
    @param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
    @return the prediction as CoremlEncoderOutput
 */
 - (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
 /**
    Batch prediction
    @param inputArray array of CoremlEncoderInput instances to obtain predictions from
    @param options prediction options
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
    @return the predictions as NSArray<CoremlEncoderOutput *>
 */
 - (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
@end
 NS_ASSUME_NONNULL_END
--- a/coreml/whisper-encoder-impl.m
+++ b/coreml/whisper-encoder-impl.m
@ -0,0 +1,197 @@
 //
 // CoremlEncoder.m
 //
 // This file was automatically generated and should not be edited.
 //
 #if !__has_feature(objc_arc)
 #error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
 #endif
 #import "whisper-encoder-impl.h"
@implementation CoremlEncoderInput
 - (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
    self = [super init];
    if (self) {
        _melSegment = melSegment;
    }
    return self;
 }
 - (NSSet<NSString *> *)featureNames {
    return [NSSet setWithArray:@[@"melSegment"]];
 }
 - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
    if ([featureName isEqualToString:@"melSegment"]) {
        return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
    }
    return nil;
 }
@end
@implementation CoremlEncoderOutput
 - (instancetype)initWithOutput:(MLMultiArray *)output {
    self = [super init];
    if (self) {
        _output = output;
    }
    return self;
 }
 - (NSSet<NSString *> *)featureNames {
    return [NSSet setWithArray:@[@"output"]];
 }
 - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
    if ([featureName isEqualToString:@"output"]) {
        return [MLFeatureValue featureValueWithMultiArray:self.output];
    }
    return nil;
 }
@end
@implementation CoremlEncoder
 /**
    URL of the underlying .mlmodelc directory.
 */
 + (nullable NSURL *)URLOfModelInThisBundle {
    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
    return [NSURL fileURLWithPath:assetPath];
 }
 /**
    Initialize CoremlEncoder instance from an existing MLModel object.
    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
 */
 - (instancetype)initWithMLModel:(MLModel *)model {
    self = [super init];
    if (!self) { return nil; }
    _model = model;
    if (_model == nil) { return nil; }
    return self;
 }
 /**
    Initialize CoremlEncoder instance with the model in this bundle.
 */
 - (nullable instancetype)init {
    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
 }
 /**
    Initialize CoremlEncoder instance with the model in this bundle.
    @param configuration The model configuration object
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
 }
 /**
    Initialize CoremlEncoder instance from the model URL.
    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
    if (model == nil) { return nil; }
    return [self initWithMLModel:model];
 }
 /**
    Initialize CoremlEncoder instance from the model URL.
    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
    @param configuration The model configuration object
    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
 */
 - (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
    if (model == nil) { return nil; }
    return [self initWithMLModel:model];
 }
 /**
    Construct CoremlEncoder instance asynchronously with configuration.
    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
 */
 + (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
              configuration:configuration
          completionHandler:handler];
 }
 /**
    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
    @param modelURL The model URL.
    @param configuration The model configuration
    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
 */
 + (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
    [MLModel loadContentsOfURL:modelURL
                 configuration:configuration
             completionHandler:^(MLModel *model, NSError *error) {
        if (model != nil) {
            CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
            handler(typedModel, nil);
        } else {
            handler(nil, error);
        }
    }];
 }
 - (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
 }
 - (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
    if (!outFeatures) { return nil; }
    return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
 }
 - (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
    return [self predictionFromFeatures:input_ error:error];
 }
 - (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
    if (!outBatch) { return nil; }
    NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
    for (NSInteger i = 0; i < outBatch.count; i++) {
        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
        CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
        [results addObject:result];
    }
    return results;
 }
@end
--- a/coreml/whisper-encoder.h
+++ b/coreml/whisper-encoder.h
@ -0,0 +1,22 @@
 // Wrapper of the Core ML Whisper Encoder model
 //
 // Code is derived from the work of Github user @wangchou
 // ref: https://github.com/wangchou/callCoreMLFromCpp
 #if __cplusplus
 extern "C" {
 #endif
 // Opaque handle owning a loaded Core ML encoder model.
 struct whisper_coreml_context;
 // Loads the compiled Core ML model at `path_model`.
 // Returns NULL when the model cannot be loaded.
 struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
 // Releases a context obtained from whisper_coreml_init().
 void whisper_coreml_free(struct whisper_coreml_context * ctx);
 // Runs the encoder on `mel` and copies the model output into `out`.
 // The `struct` tag is spelled out so that this header also compiles as C,
 // where a bare `whisper_coreml_context` is not a type name.
 void whisper_coreml_encode(
        const struct whisper_coreml_context * ctx,
                                      float * mel,
                                      float * out);
 #if __cplusplus
 }
 #endif
--- a/coreml/whisper-encoder.mm
+++ b/coreml/whisper-encoder.mm
@ -0,0 +1,61 @@
 #import "coreml/whisper-encoder.h"
 #import "coreml/whisper-encoder-impl.h"
 #import <CoreML/CoreML.h>
 #include <stdlib.h>
 #if __cplusplus
 extern "C" {
 #endif
 // Backing storage for the opaque handle declared in whisper-encoder.h.
 struct whisper_coreml_context {
    // Retained CoremlEncoder instance, stored as an untyped pointer.
    // Set via CFBridgingRetain() in whisper_coreml_init() and balanced by
    // CFRelease() in whisper_coreml_free().
    const void * data;
 };
 // Loads the Core ML encoder model located at `path_model`.
 //
 // Returns a newly allocated context that must be released with
 // whisper_coreml_free(), or NULL when the path is NULL, is not valid UTF-8,
 // or the model cannot be loaded.
 struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
    // -initWithUTF8String: raises on NULL input, so reject it up front
    if (path_model == NULL) {
        return NULL;
    }
    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
    // nil here means the bytes were not valid UTF-8; +fileURLWithPath: would raise
    if (path_model_str == nil) {
        return NULL;
    }
    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
    // Transfer ownership of the encoder out of ARC; whisper_coreml_free() releases it
    const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
    if (data == NULL) {
        return NULL;
    }
    whisper_coreml_context * ctx = new whisper_coreml_context;
    ctx->data = data;
    return ctx;
 }
 // Releases the encoder held by `ctx` and frees the context itself.
 // Passing NULL is a no-op, which lets callers free unconditionally even when
 // whisper_coreml_init() failed and returned NULL.
 void whisper_coreml_free(struct whisper_coreml_context * ctx) {
    if (ctx == NULL) {
        return;
    }
    // Balances the CFBridgingRetain() performed in whisper_coreml_init()
    CFRelease(ctx->data);
    delete ctx;
 }
 // Runs the Core ML encoder on one mel segment.
 //
 //   ctx - context returned by whisper_coreml_init()
 //   mel - input buffer, wrapped without copying as a 1 x 80 x 3000 float32 array
 //   out - destination buffer; receives `count` floats of the model output
 //
 // On failure a message is logged and `out` is left untouched (the function
 // has no return value to report the error through).
 void whisper_coreml_encode(
        const whisper_coreml_context * ctx,
                               float * mel,
                               float * out) {
    // This function may be called from threads that have no enclosing
    // autorelease pool; drain the temporaries created by each call here so
    // they do not accumulate across calls.
    @autoreleasepool {
        NSError * error = nil;
        MLMultiArray * inMultiArray = [
            [MLMultiArray alloc] initWithDataPointer: mel
                                               shape: @[@1, @80, @3000]
                                            dataType: MLMultiArrayDataTypeFloat32
                                             strides: @[@(240000), @(3000), @1]
                                         deallocator: nil
                                               error: &error
        ];
        if (inMultiArray == nil) {
            NSLog(@"whisper_coreml_encode: failed to wrap the input buffer: %@", error);
            return;
        }
        CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:&error];
        MLMultiArray * outMA = outCoreML.output;
        // Never memcpy from a nil array's (NULL) data pointer
        if (outMA == nil) {
            NSLog(@"whisper_coreml_encode: prediction failed: %@", error);
            return;
        }
        memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
    }
 }
 #if __cplusplus
 }
 #endif
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -72,7 +72,7 @@ int timestamp_to_sample(int64_t t, int n_samples) {
    return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
 }
-void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
@ -260,7 +260,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -292,13 +292,41 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
    return 0;
 }
-Napi::Object whisper(const Napi::CallbackInfo& info) {
+class Worker : public Napi::AsyncWorker {
 // Async worker that runs the transcription through run() and delivers the
 // collected result to a JavaScript callback.
 // NOTE(review): per node-addon-api, Execute() runs off the main thread and
 // OnOK() runs on it afterwards - confirm against the AsyncWorker docs.
 public:
  // Stores a copy of `params`; `callback` is forwarded to the base class and
  // is the function invoked from OnOK().
  Worker(Napi::Function& callback, whisper_params params)
      : Napi::AsyncWorker(callback), params(params) {}
  // Runs the transcription and fills `result`. The return value of run() is ignored.
  void Execute() override {
    run(params, result);
  }
  // Converts `result` into a JS array of 3-element string arrays and calls the
  // callback as callback(null, res).
  // Assumes every entry of `result` holds at least 3 strings - TODO confirm in run().
  void OnOK() override {
    Napi::HandleScope scope(Env());
    Napi::Object res = Napi::Array::New(Env(), result.size());
    for (uint64_t i = 0; i < result.size(); ++i) {
      Napi::Object tmp = Napi::Array::New(Env(), 3);
      for (uint64_t j = 0; j < 3; ++j) {
        tmp[j] = Napi::String::New(Env(), result[i][j]);
      }
      res[i] = tmp;
    }
    Callback().Call({Env().Null(), res});
  }
 private:
  whisper_params params;                            // input parameters passed to run()
  std::vector<std::vector<std::string>> result;     // output rows filled by run()
 };
 Napi::Value whisper(const Napi::CallbackInfo& info) {
  Napi::Env env = info.Env();
  if (info.Length() <= 0 || !info[0].IsObject()) {
    Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
  }
  whisper_params params;
    std::vector<std::vector<std::string>> result;
  Napi::Object whisper_params = info[0].As<Napi::Object>();
  std::string language = whisper_params.Get("language").As<Napi::String>();
@ -309,25 +337,10 @@ Napi::Object whisper(const Napi::CallbackInfo& info) {
  params.model = model;
  params.fname_inp.emplace_back(input);
-    // run model
+  Napi::Function callback = info[1].As<Napi::Function>();
-    run(params, result);
+  Worker* worker = new Worker(callback, params);
-
+  worker->Queue();
-    fprintf(stderr, "RESULT:\n");
+  return env.Undefined();
    for (auto sentence:result) {
        fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
                sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
    }
    Napi::Object res = Napi::Array::New(env, result.size());
    for (uint64_t i = 0; i < result.size(); ++i) {
        Napi::Object tmp = Napi::Array::New(env, 3);
        for (uint64_t j = 0; j < 3; ++j) {
            tmp[j] = Napi::String::New(env, result[i][j]);
        }
        res[i] = tmp;
    }
    return res;
 }
--- a/examples/addon.node/index.js
+++ b/examples/addon.node/index.js
@ -1,10 +1,16 @@
-const path = require('path');
+const path = require("path");
-const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));
+const { whisper } = require(path.join(
  __dirname,
  "../../build/Release/whisper-addon"
 ));
 const { promisify } = require("util");
 const whisperAsync = promisify(whisper);
 const whisperParams = {
-    language: 'en',
+  language: "en",
-    model: path.join(__dirname, '../../models/ggml-base.en.bin'),
+  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
-    fname_inp: '',
+  fname_inp: "../../samples/jfk.wav",
 };
 const arguments = process.argv.slice(2);
@ -14,7 +20,7 @@ const params = Object.fromEntries(
      return [...pre, item.slice(2).split("=")];
    }
    return pre;
-    }, []),
+  }, [])
 );
 for (const key in params) {
@ -23,5 +29,8 @@ for (const key in params) {
  }
 }
-console.log('whisperParams =', whisperParams);
+console.log("whisperParams =", whisperParams);
-console.log(whisper(whisperParams));
+
 whisperAsync(whisperParams).then((result) => {
  console.log(`Result from whisper: ${result}`);
 });
--- a/examples/helpers.js
+++ b/examples/helpers.js
@ -145,15 +145,7 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
                            var db = event.target.result;
                            var tx = db.transaction(['models'], 'readwrite');
                            var os = tx.objectStore('models');
                            var rq = null;
                            try {
                            var rq = os.put(data, url);
                            } catch (e) {
                                cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
                                cbCancel();
                                return;
                            }
                            rq.onsuccess = function (event) {
                                cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
@ -188,6 +180,7 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
    rq.onabort = function (event) {
        cbPrint('loadRemote: failed to open IndexedDB: abort');
-        cbCancel();
+
    };
 }
--- a/examples/main/README.md
+++ b/examples/main/README.md
@ -31,6 +31,7 @@ options:
  -osrt,     --output-srt        [false  ] output result in a srt file
  -owts,     --output-words      [false  ] output script for generating karaoke video
  -ocsv,     --output-csv        [false  ] output result in a CSV file
  -oj,       --output-json       [false  ] output result in a JSON file
  -of FNAME, --output-file FNAME [       ] output file path (without file extension)
  -ps,       --print-special     [false  ] print special tokens
  -pc,       --print-colors      [false  ] print colors
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -73,6 +73,7 @@ struct whisper_params {
    bool output_srt     = false;
    bool output_wts     = false;
    bool output_csv     = false;
    bool output_jsn     = false;
    bool print_special  = false;
    bool print_colors   = false;
    bool print_progress = false;
@ -80,6 +81,7 @@ struct whisper_params {
    std::string language = "en";
    std::string prompt;
    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
    std::string model    = "models/ggml-base.en.bin";
    std::vector<std::string> fname_inp = {};
@ -127,7 +129,9 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
        else if (arg == "-osrt" || arg == "--output-srt")     { params.output_srt     = true; }
        else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
        else if (arg == "-fp"   || arg == "--font-path")      { params.font_path      = argv[++i]; }
        else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
        else if (arg == "-oj"   || arg == "--output-json")    { params.output_jsn     = true; }
        else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
        else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
@ -174,7 +178,9 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ovtt,     --output-vtt        [%-7s] output result in a vtt file\n",                    params.output_vtt ? "true" : "false");
    fprintf(stderr, "  -osrt,     --output-srt        [%-7s] output result in a srt file\n",                    params.output_srt ? "true" : "false");
    fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
    fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
    fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
    fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
    fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
    fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
@ -193,7 +199,7 @@ struct whisper_print_user_data {
    const std::vector<std::vector<float>> * pcmf32s;
 };
-void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
    const auto & params  = *((whisper_print_user_data *) user_data)->params;
    const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
@ -352,28 +358,157 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
    const int n_segments = whisper_full_n_segments(ctx);
    fout << "start,end,text\n";
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);
        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
        //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text    << "\"\n";
+        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text    << "\"\n";
    }
    return true;
 }
 // Write model metadata, run parameters and the transcription held by `ctx`
 // to `fname` as a tab-indented JSON document.
 //
 // Returns false when `fname` cannot be opened for writing, true otherwise.
 bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
    std::ofstream fout(fname);
    int indent = 0;
    auto doindent = [&]() {
        for (int i = 0; i < indent; i++) fout << "\t";
    };
    // escape a C string so it is a valid JSON string body (quotes, backslashes
    // and control characters); a NULL pointer is treated as an empty string
    auto escape = [](const char * str) {
        static const char hex[] = "0123456789abcdef";
        std::string res;
        for (const char * p = (str ? str : ""); *p != '\0'; ++p) {
            const unsigned char c = (unsigned char) *p;
            switch (c) {
                case '"':  res += "\\\""; break;
                case '\\': res += "\\\\"; break;
                case '\n': res += "\\n";  break;
                case '\r': res += "\\r";  break;
                case '\t': res += "\\t";  break;
                default:
                    if (c < 0x20) {
                        res += "\\u00";
                        res += hex[c >> 4];
                        res += hex[c & 0x0F];
                    } else {
                        res += (char) c;
                    }
                    break;
            }
        }
        return res;
    };
    auto start_arr = [&](const char *name) {
        doindent();
        fout << "\"" << name << "\": [\n";
        indent++;
    };
    // `end` marks the last member of the enclosing object (no trailing comma)
    auto end_arr = [&](bool end = false) {
        indent--;
        doindent();
        fout << (end ? "]\n" : "],\n");
    };
    auto start_obj = [&](const char *name = nullptr) {
        doindent();
        if (name) {
            fout << "\"" << name << "\": {\n";
        } else {
            fout << "{\n";
        }
        indent++;
    };
    auto end_obj = [&](bool end = false) {
        indent--;
        doindent();
        fout << (end ? "}\n" : "},\n");
    };
    auto start_value = [&](const char *name) {
        doindent();
        fout << "\"" << name << "\": ";
    };
    auto value_s = [&](const char *name, const char *val, bool end = false) {
        start_value(name);
        fout << "\"" << escape(val) << (end ? "\"\n" : "\",\n");
    };
    auto end_value = [&](bool end = false) {
        fout << (end ? "\n" : ",\n");
    };
    auto value_i = [&](const char *name, const int64_t val, bool end = false) {
        start_value(name);
        fout << val;
        end_value(end);
    };
    auto value_b = [&](const char *name, const bool val, bool end = false) {
        start_value(name);
        fout << (val ? "true" : "false");
        end_value(end);
    };
    if (!fout.is_open()) {
        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
        return false;
    }
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
    start_obj();
        value_s("systeminfo", whisper_print_system_info());
        start_obj("model");
            value_s("type", whisper_model_type_readable(ctx));
            value_b("multilingual", whisper_is_multilingual(ctx));
            value_i("vocab", whisper_model_n_vocab(ctx));
            start_obj("audio");
                value_i("ctx", whisper_model_n_audio_ctx(ctx));
                value_i("state", whisper_model_n_audio_state(ctx));
                value_i("head", whisper_model_n_audio_head(ctx));
                value_i("layer", whisper_model_n_audio_layer(ctx), true);
            end_obj();
            start_obj("text");
                value_i("ctx", whisper_model_n_text_ctx(ctx));
                value_i("state", whisper_model_n_text_state(ctx));
                value_i("head", whisper_model_n_text_head(ctx));
                value_i("layer", whisper_model_n_text_layer(ctx), true);
            end_obj();
            value_i("mels", whisper_model_n_mels(ctx));
            value_i("f16", whisper_model_f16(ctx), true);
        end_obj();
        start_obj("params");
            value_s("model", params.model.c_str());
            value_s("language", params.language.c_str());
            value_b("translate", params.translate, true);
        end_obj();
        start_obj("result");
            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
        end_obj();
        start_arr("transcription");
            const int n_segments = whisper_full_n_segments(ctx);
            for (int i = 0; i < n_segments; ++i) {
                const char * text = whisper_full_get_segment_text(ctx, i);
                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
                start_obj();
                    start_obj("timestamps");
                        value_s("from", to_timestamp(t0, true).c_str());
                        value_s("to", to_timestamp(t1, true).c_str(), true);
                    end_obj();
                    start_obj("offsets");
                        // segment times are multiplied by 10, as in output_csv, to get milliseconds
                        value_i("from", t0 * 10);
                        value_i("to", t1 * 10, true);
                    end_obj();
                    value_s("text", text, true);
                // the last array element must not be followed by a comma
                end_obj(i == (n_segments - 1));
            }
        end_arr(true);
    end_obj(true);
    return true;
 }
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
    std::ofstream fout(fname);
    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-    // TODO: become parameter
+    static const char * font = params.font_path.c_str();
-    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
+
    std::ifstream fin(font);
    if (!fin.is_open()) {
        fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
        return false;
    }
    fout << "#!/bin/bash" << "\n";
    fout << "\n";
@ -607,7 +742,7 @@ int main(int argc, char ** argv) {
            {
                static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
+                wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
                    bool is_aborted = *(bool*)user_data;
                    return !is_aborted;
                };
@ -653,6 +788,12 @@ int main(int argc, char ** argv) {
                const auto fname_csv = fname_out + ".csv";
                output_csv(ctx, fname_csv.c_str());
            }
            // output to JSON file
            if (params.output_jsn) {
                const auto fname_jsn = fname_out + ".json";
                output_json(ctx, fname_jsn.c_str(), params);
            }
        }
    }
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -288,7 +288,6 @@ int main(int argc, char ** argv) {
            wparams.print_realtime   = false;
            wparams.print_timestamps = !params.no_timestamps;
            wparams.translate        = params.translate;
            wparams.no_context       = true;
            wparams.single_segment   = !use_vad;
            wparams.max_tokens       = params.max_tokens;
            wparams.language         = params.language.c_str();
--- a/examples/talk/README.md
+++ b/examples/talk/README.md
@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
 Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:
 ```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
+wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
 ```
 ## TTS
--- a/examples/whisper.android/README.md
+++ b/examples/whisper.android/README.md
@ -9,4 +9,4 @@ To use:
 5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
 [^1]: I recommend the tiny or base models for running on an Android device.
-<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">
+<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreen.kt
@ -2,6 +2,7 @@ package com.whispercppdemo.ui.main
 import androidx.compose.foundation.layout.*
 import androidx.compose.foundation.rememberScrollState
 import androidx.compose.foundation.text.selection.SelectionContainer
 import androidx.compose.foundation.verticalScroll
 import androidx.compose.material3.*
 import androidx.compose.runtime.Composable
@ -19,6 +20,7 @@ fun MainScreen(viewModel: MainScreenViewModel) {
        canTranscribe = viewModel.canTranscribe,
        isRecording = viewModel.isRecording,
        messageLog = viewModel.dataLog,
        onBenchmarkTapped = viewModel::benchmark,
        onTranscribeSampleTapped = viewModel::transcribeSample,
        onRecordTapped = viewModel::toggleRecord
    )
@ -30,6 +32,7 @@ private fun MainScreen(
    canTranscribe: Boolean,
    isRecording: Boolean,
    messageLog: String,
    onBenchmarkTapped: () -> Unit,
    onTranscribeSampleTapped: () -> Unit,
    onRecordTapped: () -> Unit
 ) {
@ -45,8 +48,11 @@ private fun MainScreen(
                .padding(innerPadding)
                .padding(16.dp)
        ) {
-            Row(horizontalArrangement = Arrangement.SpaceBetween) {
+            Column(verticalArrangement = Arrangement.SpaceBetween) {
                Row(horizontalArrangement = Arrangement.SpaceBetween, modifier = Modifier.fillMaxWidth()) {
                    BenchmarkButton(enabled = canTranscribe, onClick = onBenchmarkTapped)
                    TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
                }
                RecordButton(
                    enabled = canTranscribe,
                    isRecording = isRecording,
@ -60,7 +66,16 @@ private fun MainScreen(
@Composable
 private fun MessageLog(log: String) {
    // The log text is selectable and scrolls vertically.
    SelectionContainer {
        val scrollState = rememberScrollState()
        Text(modifier = Modifier.verticalScroll(scrollState), text = log)
    }
 }
@Composable
 private fun BenchmarkButton(enabled: Boolean, onClick: () -> Unit) {
    // Button labelled "Benchmark"; `enabled` and `onClick` are passed straight to Button.
    Button(
        onClick = onClick,
        enabled = enabled
    ) {
        Text(text = "Benchmark")
    }
 }
@Composable
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/ui/main/MainScreenViewModel.kt
@ -41,10 +41,15 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
    init {
        viewModelScope.launch {
            printSystemInfo()
            loadData()
        }
    }
    /** Appends the system-info string reported by [WhisperContext.getSystemInfo] to the message log. */
    private suspend fun printSystemInfo() {
        val systemInfo = WhisperContext.getSystemInfo()
        printMessage("System Info: $systemInfo\n")
    }
    private suspend fun loadData() {
        printMessage("Loading data...\n")
        try {
@ -81,10 +86,29 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        //whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
    }
    /** Launches [runBenchmark] in [viewModelScope] with a hard-coded thread count of 6. */
    fun benchmark() = viewModelScope.launch {
        runBenchmark(6)
    }
    /** Launches transcription of the file returned by [getFirstSample] in [viewModelScope]. */
    fun transcribeSample() = viewModelScope.launch {
        transcribeAudio(getFirstSample())
    }
    /**
     * Runs the memory and ggml mul-mat benchmarks with [nthreads] threads and
     * prints their reports to the message log.
     *
     * Does nothing when [canTranscribe] is false. While running, [canTranscribe]
     * is set to false; it is restored in a `finally` block so the UI is not left
     * disabled if a benchmark call throws or the coroutine is cancelled.
     */
    private suspend fun runBenchmark(nthreads: Int) {
        if (!canTranscribe) {
            return
        }
        canTranscribe = false
        try {
            printMessage("Running benchmark. This will take minutes...\n")
            whisperContext?.benchMemory(nthreads)?.let{ printMessage(it) }
            printMessage("\n")
            whisperContext?.benchGgmlMulMat(nthreads)?.let{ printMessage(it) }
        } finally {
            canTranscribe = true
        }
    }
    /**
     * Returns the first entry of `samplesPath.listFiles()`, evaluated on [Dispatchers.IO].
     *
     * Throws if `listFiles()` returns null (because of `!!`) or if the listing
     * is empty (because of `first()`).
     */
    private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
        samplesPath.listFiles()!!.first()
    }
@ -114,11 +138,14 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
        canTranscribe = false
        try {
-            printMessage("Reading wave samples...\n")
+            printMessage("Reading wave samples... ")
            val data = readAudioSamples(file)
            printMessage("${data.size / (16000 / 1000)} ms\n")
            printMessage("Transcribing data...\n")
            val start = System.currentTimeMillis()
            val text = whisperContext?.transcribeData(data)
-            printMessage("Done: $text\n")
+            val elapsed = System.currentTimeMillis() - start
            printMessage("Done ($elapsed ms): $text\n")
        } catch (e: Exception) {
            Log.w(LOG_TAG, e)
            printMessage("${e.localizedMessage}\n")
--- a/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
+++ b/examples/whisper.android/app/src/main/java/com/whispercppdemo/whisper/LibWhisper.kt
@ -27,6 +27,14 @@ class WhisperContext private constructor(private var ptr: Long) {
        }
    }
    /** Runs the native memcpy benchmark with [nthreads] threads on this context's scope and returns its report. */
    suspend fun benchMemory(nthreads: Int): String = withContext(scope.coroutineContext) {
        WhisperLib.benchMemcpy(nthreads)
    }
    /** Runs the native ggml mul-mat benchmark with [nthreads] threads on this context's scope and returns its report. */
    suspend fun benchGgmlMulMat(nthreads: Int): String = withContext(scope.coroutineContext) {
        WhisperLib.benchGgmlMulMat(nthreads)
    }
    suspend fun release() = withContext(scope.coroutineContext) {
        if (ptr != 0L) {
            WhisperLib.freeContext(ptr)
@ -66,6 +74,10 @@ class WhisperContext private constructor(private var ptr: Long) {
            }
            return WhisperContext(ptr)
        }
        /** Returns the system-info string reported by the native library. */
        fun getSystemInfo(): String = WhisperLib.getSystemInfo()
    }
 }
@ -74,6 +86,7 @@ private class WhisperLib {
        init {
            Log.d(LOG_TAG, "Primary ABI: ${Build.SUPPORTED_ABIS[0]}")
            var loadVfpv4 = false
            var loadV8fp16 = false
            if (isArmEabiV7a()) {
                // armeabi-v7a needs runtime detection support
                val cpuInfo = cpuInfo()
@ -84,11 +97,24 @@ private class WhisperLib {
                        loadVfpv4 = true
                    }
                }
            } else if (isArmEabiV8a()) {
                // ARMv8.2a needs runtime detection support
                val cpuInfo = cpuInfo()
                cpuInfo?.let {
                    Log.d(LOG_TAG, "CPU info: $cpuInfo")
                    if (cpuInfo.contains("fphp")) {
                        Log.d(LOG_TAG, "CPU supports fp16 arithmetic")
                        loadV8fp16 = true
                    }
                }
            }
            if (loadVfpv4) {
                Log.d(LOG_TAG, "Loading libwhisper_vfpv4.so")
                System.loadLibrary("whisper_vfpv4")
            } else if (loadV8fp16) {
                Log.d(LOG_TAG, "Loading libwhisper_v8fp16_va.so")
                System.loadLibrary("whisper_v8fp16_va")
            } else {
                Log.d(LOG_TAG, "Loading libwhisper.so")
                System.loadLibrary("whisper")
@ -103,6 +129,9 @@ private class WhisperLib {
        external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
        external fun getTextSegmentCount(contextPtr: Long): Int
        external fun getTextSegment(contextPtr: Long, index: Int): String
        external fun getSystemInfo(): String
        external fun benchMemcpy(nthread: Int): String
        external fun benchGgmlMulMat(nthread: Int): String
    }
 }
@ -110,6 +139,10 @@ private fun isArmEabiV7a(): Boolean {
    return Build.SUPPORTED_ABIS[0].equals("armeabi-v7a")
 }
 /** True when the primary ABI in [Build.SUPPORTED_ABIS] is `arm64-v8a`. */
 private fun isArmEabiV8a(): Boolean {
    val primaryAbi = Build.SUPPORTED_ABIS[0]
    return primaryAbi == "arm64-v8a"
 }
 private fun cpuInfo(): String? {
    return try {
        File("/proc/cpuinfo").inputStream().bufferedReader().use {
--- a/examples/whisper.android/app/src/main/jni/whisper/Android.mk
+++ b/examples/whisper.android/app/src/main/jni/whisper/Android.mk
@ -13,3 +13,14 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
 	LOCAL_CFLAGS += -mfpu=neon-vfpv4
 	include $(BUILD_SHARED_LIBRARY)
 endif
 ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
 	# Extra arm64-v8a build of the library, compiled with fp16 support enabled.
 	include $(CLEAR_VARS)
 	LOCAL_MODULE    := libwhisper_v8fp16_va
 	include $(LOCAL_PATH)/Whisper.mk
 	# Allow building ARMv8.2-A half-precision (fp16) arithmetic code.
 	# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
 	LOCAL_CFLAGS += -march=armv8.2-a+fp16
 	include $(BUILD_SHARED_LIBRARY)
 endif
--- a/examples/whisper.android/app/src/main/jni/whisper/jni.c
+++ b/examples/whisper.android/app/src/main/jni/whisper/jni.c
@ -6,6 +6,7 @@
 #include <sys/sysinfo.h>
 #include <string.h>
 #include "whisper.h"
 #include "ggml.h"
 #define UNUSED(x) (void)(x)
 #define TAG "JNI"
@ -214,3 +215,29 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
    jstring string = (*env)->NewStringUTF(env, text);
    return string;
 }
 // JNI entry point: returns whisper_print_system_info() as a Java string.
 JNIEXPORT jstring JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
        JNIEnv *env, jobject thiz
 ) {
    UNUSED(thiz);
    return (*env)->NewStringUTF(env, whisper_print_system_info());
 }
 // JNI entry point: runs whisper_bench_memcpy_str() with `n_threads` threads
 // and returns its report as a Java string.
 JNIEXPORT jstring JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
                                                                      jint n_threads) {
    UNUSED(thiz);
    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
    // the function is declared to return jstring; falling off the end is undefined behavior
    return string;
 }
 // JNI entry point: runs whisper_bench_ggml_mul_mat_str() with `n_threads`
 // threads and returns its report as a Java string.
 JNIEXPORT jstring JNICALL
 Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
                                                                          jint n_threads) {
    UNUSED(thiz);
    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
    // the function is declared to return jstring; falling off the end is undefined behavior
    return string;
 }
--- a/examples/whisper.objc/README.md
+++ b/examples/whisper.objc/README.md
@ -24,3 +24,5 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
 This can significantly improve the performance of the transcription:
 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
 This project also adds `-O3 -DNDEBUG` to `Other C Flags`. Note that adding flags at the app-project level is not ideal in real-world projects, because they apply to all C/C++ files; consider splitting the xcodeproj into a workspace in your own project.
--- a/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
+++ b/examples/whisper.objc/whisper.objc.xcodeproj/project.pbxproj
@ -296,6 +296,10 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
 				OTHER_CFLAGS = (
 					"-O3",
 					"-DNDEBUG",
 				);
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
 			};
--- a/examples/whisper.swiftui/README.md
+++ b/examples/whisper.swiftui/README.md
@ -7,8 +7,9 @@ To use:
 2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
 3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
 4. Add the sample audio file to "whisper.swiftui.demo/Resources/samples" via Xcode.
-5. Select the "release" build configuration under "Run", then deploy and run to your device.
+5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
 [^1]: I recommend the tiny, base or small models for running on an iOS device.
 [^2]: The `Release` build can boost transcription performance. This project also adds `-O3 -DNDEBUG` to `Other C Flags`. Note that adding flags at the app-project level is not ideal in real-world projects, because they apply to all C/C++ files; consider splitting the xcodeproj into a workspace in your own project.
 ![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)
--- a/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
+++ b/examples/whisper.swiftui/whisper.swiftui.xcodeproj/project.pbxproj
@ -430,6 +430,10 @@
 				LLVM_LTO = YES;
 				MACOSX_DEPLOYMENT_TARGET = 13.0;
 				MARKETING_VERSION = 1.0;
 				OTHER_CFLAGS = (
 					"-O3",
 					"-DNDEBUG",
 				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SDKROOT = auto;
--- a/examples/whisper.wasm/CMakeLists.txt
+++ b/examples/whisper.wasm/CMakeLists.txt
@ -31,9 +31,9 @@ endif()
 set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
    --bind \
    -s USE_PTHREADS=1 \
-    -s PTHREAD_POOL_SIZE_STRICT=0 \
+    -s PTHREAD_POOL_SIZE=8 \
-    -s INITIAL_MEMORY=2000MB \
+    -s INITIAL_MEMORY=1500MB \
-    -s TOTAL_MEMORY=2000MB \
+    -s TOTAL_MEMORY=1500MB \
    -s FORCE_FILESYSTEM=1 \
    -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
    ${EXTRA_FLAGS} \
--- a/examples/whisper.wasm/emscripten.cpp
+++ b/examples/whisper.wasm/emscripten.cpp
@ -10,12 +10,6 @@ std::thread g_worker;
 std::vector<struct whisper_context *> g_contexts(4, nullptr);
 // Largest power of two that is <= n; returns 0 when n < 1.
 static inline int mpow2(int n) {
    int bound = 1;
    for (; bound <= n; bound *= 2) {
        // keep doubling until the bound exceeds n
    }
    return bound / 2;
 }
 EMSCRIPTEN_BINDINGS(whisper) {
    emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
        if (g_worker.joinable()) {
@ -49,7 +43,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        }
    }));
-    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) {
+    emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) {
        if (g_worker.joinable()) {
            g_worker.join();
        }
@ -72,7 +66,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
        params.print_special    = false;
        params.translate        = translate;
        params.language         = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
-        params.n_threads        = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
+        params.n_threads        = std::min(8, (int) std::thread::hardware_concurrency());
        params.offset_ms        = 0;
        std::vector<float> pcmf32;
--- a/examples/whisper.wasm/index-tmpl.html
+++ b/examples/whisper.wasm/index-tmpl.html
@ -40,34 +40,21 @@
            Note that the computation is quite heavy and may take a few seconds to complete.<br>
            The transcription results will be displayed in the text area below.<br><br>
-            <b>Important:</b>
+            <b>Important: your browser must support WASM SIMD instructions for this to work.</b>
                <ul>
                    <li>your browser must support WASM SIMD instructions for this to work</li>
                    <li>quantized models are still in experimental stage (<a href="https://github.com/ggerganov/ggml/pull/27">more info</a>)</li>
                    <li>Firefox cannot load files larger than 256 MB - use Chrome instead</li>
                </ul>
-            <hr>
+            <br><br><hr>
            <div id="model">
-                Whisper models: <span id="model-whisper-status"></span><br><br>
+                Whisper model: <span id="model-whisper-status"></span>
                <button id="fetch-whisper-tiny-en"  onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
                <button id="fetch-whisper-tiny"     onclick="loadWhisper('tiny')">tiny (75 MB)</button>
                <button id="fetch-whisper-base-en"  onclick="loadWhisper('base.en')">base.en (142 MB)</button>
                <button id="fetch-whisper-base"     onclick="loadWhisper('base')">base (142 MB)</button>
                <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
                <button id="fetch-whisper-small"    onclick="loadWhisper('small')">small (466 MB)</button>
                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
                <br><br>
                Quantized models:<br><br>
                <button id="fetch-whisper-base-en-q4_0"   onclick="loadWhisper('base-en-q4_0')">base.en (4bit, 49 MB)</button>
                <button id="fetch-whisper-base-q4_0"      onclick="loadWhisper('base-q4_0')">base (4bit, 49 MB)</button>
                <button id="fetch-whisper-small-en-q4_0"  onclick="loadWhisper('small-en-q4_0')">small.en (4bit, 152 MB)</button>
                <button id="fetch-whisper-small-q4_0"     onclick="loadWhisper('small-q4_0')">small (4bit, 152 MB)</button><br>
                <button id="fetch-whisper-medium-en-q4_0" onclick="loadWhisper('medium-en-q4_0')">medium.en (4bit, 469 MB)</button>
                <button id="fetch-whisper-medium-q4_0"    onclick="loadWhisper('medium-q4_0')">medium (4bit, 469 MB)</button>
                <button id="fetch-whisper-large-q4_0"     onclick="loadWhisper('large-q4_0')">large (4bit, 985 MB)</button>
                <span id="fetch-whisper-progress"></span>
                <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
            </div>
            <br>
@ -174,12 +161,6 @@
                            <option value="yi">Yiddish</option>
                        </select>
                    </td>
                    <!-- Slider to select number of threads between 1 and 16 -->
                    <td>
                        Threads:
                        <input type="range" id="threads" name="threads" min="1" max="16" value="8" onchange="changeThreads(this.value)" />
                        <span id="threads-value">8</span>
                    </td>
                    <td>
                        <button onclick="onProcess(false);">Transcribe</button>
                    </td>
@ -282,13 +263,11 @@
                Module.FS_createDataFile("/", fname, buf, true, true);
-                //model_whisper = fname;
+                model_whisper = fname;
                document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
                printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
                document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper;
            }
            function loadFile(event, fname) {
@ -313,15 +292,6 @@
                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
                document.getElementById('fetch-whisper-base-en-q4_0'  ).style.display = 'none';
                document.getElementById('fetch-whisper-base-q4_0'     ).style.display = 'none';
                document.getElementById('fetch-whisper-small-en-q4_0' ).style.display = 'none';
                document.getElementById('fetch-whisper-small-q4_0'    ).style.display = 'none';
                document.getElementById('fetch-whisper-medium-en-q4_0').style.display = 'none';
                document.getElementById('fetch-whisper-medium-q4_0'   ).style.display = 'none';
                document.getElementById('fetch-whisper-large-q4_0'    ).style.display = 'none';
                document.getElementById('whisper-file'          ).style.display = 'none';
                document.getElementById('model-whisper-status'  ).innerHTML = 'loaded model: ' + file.name;
            }
@ -334,14 +304,6 @@
                    'base':     'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
                    'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
                    'small':    'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
                    'base-en-q4_0':  'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q4_0.bin',
                    'base-q4_0':     'https://whisper.ggerganov.com/ggml-model-whisper-base-q4_0.bin',
                    'small-en-q4_0': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q4_0.bin',
                    'small-q4_0':    'https://whisper.ggerganov.com/ggml-model-whisper-small-q4_0.bin',
                    'medium-en-q4_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q4_0.bin',
                    'medium-q4_0':   'https://whisper.ggerganov.com/ggml-model-whisper-medium-q4_0.bin',
                    'large-q4_0':    'https://whisper.ggerganov.com/ggml-model-whisper-large-q4_0.bin',
                };
                let sizes = {
@ -351,14 +313,6 @@
                    'base':     142,
                    'small.en': 466,
                    'small':    466,
                    'base-en-q4_0':   49,
                    'base-q4_0':      49,
                    'small-en-q4_0':  152,
                    'small-q4_0':     152,
                    'medium-en-q4_0': 469,
                    'medium-q4_0':    469,
                    'large-q4_0':     985,
                };
                let url     = urls[model];
@ -373,15 +327,6 @@
                document.getElementById('fetch-whisper-tiny'    ).style.display = 'none';
                document.getElementById('fetch-whisper-base'    ).style.display = 'none';
                document.getElementById('fetch-whisper-small'   ).style.display = 'none';
                document.getElementById('fetch-whisper-base-en-q4_0'  ).style.display = 'none';
                document.getElementById('fetch-whisper-base-q4_0'     ).style.display = 'none';
                document.getElementById('fetch-whisper-small-en-q4_0' ).style.display = 'none';
                document.getElementById('fetch-whisper-small-q4_0'    ).style.display = 'none';
                document.getElementById('fetch-whisper-medium-en-q4_0').style.display = 'none';
                document.getElementById('fetch-whisper-medium-q4_0'   ).style.display = 'none';
                document.getElementById('fetch-whisper-large-q4_0'    ).style.display = 'none';
                document.getElementById('whisper-file'          ).style.display = 'none';
                document.getElementById('model-whisper-status'  ).innerHTML = 'loading model: ' + model;
@ -392,22 +337,12 @@
                cbCancel = function() {
                    // Model selectors to bring back when a download is cancelled,
                    // listed in the order they are restored.
                    const selectorIds = [
                        'fetch-whisper-tiny-en',
                        'fetch-whisper-base-en',
                        'fetch-whisper-small-en',
                        'fetch-whisper-tiny',
                        'fetch-whisper-base',
                        'fetch-whisper-small',
                        'fetch-whisper-base-en-q4_0',
                        'fetch-whisper-base-q4_0',
                        'fetch-whisper-small-en-q4_0',
                        'fetch-whisper-small-q4_0',
                        'fetch-whisper-medium-en-q4_0',
                        'fetch-whisper-medium-q4_0',
                        'fetch-whisper-large-q4_0',
                        'whisper-file',
                    ];
                    // elements missing from the page are silently skipped
                    for (const id of selectorIds) {
                        const node = document.getElementById(id);
                        if (node) {
                            node.style.display = 'inline-block';
                        }
                    }
                    // clear the status text, if the status element is present
                    const statusNode = document.getElementById('model-whisper-status');
                    if (statusNode) {
                        statusNode.innerHTML = '';
                    }
                };
@ -419,8 +354,7 @@
            // audio file
            //
-            const kMaxAudio_s = 30*60;
+            const kMaxAudio_s = 120;
            const kMaxRecording_s = 2*60;
            const kSampleRate = 16000;
            window.AudioContext = window.AudioContext || window.webkitAudioContext;
@ -489,7 +423,7 @@
                doRecording = false;
            }
-            // record up to kMaxRecording_s seconds of audio from the microphone
+            // record up to kMaxAudio_s seconds of audio from the microphone
            // check if doRecording is false every 1000 ms and stop recording if so
            // update progress information
            function startRecording() {
@ -545,9 +479,9 @@
                                        printTextarea('js: audio recorded, size: ' + audio.length);
                                        // truncate to the maximum allowed audio duration
-                                        if (audio.length > kMaxRecording_s*kSampleRate) {
+                                        if (audio.length > kMaxAudio_s*kSampleRate) {
-                                            audio = audio.slice(0, kMaxRecording_s*kSampleRate);
+                                            audio = audio.slice(0, kMaxAudio_s*kSampleRate);
-                                            printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds');
+                                            printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds');
                                        }
                                        setAudio(audio);
                                    });
@ -575,31 +509,24 @@
                        });
                    }
-                    document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%';
+                    document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%';
-                    document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%';
+                    document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%';
                }, 1000);
                printTextarea('js: recording ...');
                setTimeout(function() {
                    if (doRecording) {
-                        printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds');
+                        printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds');
                        stopRecording();
                    }
-                }, kMaxRecording_s*1000);
+                }, kMaxAudio_s*1000);
            }
            //
            // transcribe
            //
            var nthreads = 8;
            // Slider callback: remember the selected thread count and show it
            // in the 'threads-value' label.
            function changeThreads(value) {
                nthreads = value;
                const label = document.getElementById('threads-value');
                label.innerHTML = nthreads;
            }
            function onProcess(translate) {
                if (!instance) {
                    instance = Module.init('whisper.bin');
@ -626,7 +553,7 @@
                    printTextarea('');
                    setTimeout(function() {
-                        var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate);
+                        var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate);
                        console.log('js: full_default returned: ' + ret);
                        if (ret) {
                            printTextarea("js: whisper returned: " + ret);
--- a/extra/bench-wts.sh
+++ b/extra/bench-wts.sh
@ -0,0 +1,70 @@
 # Benchmark word-level timestamps for different models
 #
 # This script takes two arguments
 # - an audio file
 # - [optional] path to a font file
 # I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu
 #
 # For every model it runs ./main with -owts, times the run, renders the
 # generated .wts script to a video, and finally stacks all videos into
 # "<audio file>.all.mp4". It invokes ffprobe, ffmpeg and bc, and expects
 # ./main and models/ggml-<model>.bin relative to the current directory.
 #
 # Commands are kept in arrays and every expansion of the arguments is quoted,
 # so audio/font paths containing spaces are passed through intact.
 if [ -z "$1" ]; then
    echo "Usage: $0 <audio file> [font file]"
    exit 1
 fi
 #TODO: Make this a command line parameter
 #models=(base small large)
 #models=(tiny.en tiny base.en base small.en small medium.en medium large-v1 large)
 models=(tiny.en base.en small.en medium.en large)
 DURATION=$(ffprobe -i "$1" -show_entries format=duration -v quiet -of csv="p=0")
 DURATION=$(printf "%.2f" "$DURATION")
 echo "Input file duration: ${DURATION}s"
 for model in "${models[@]}"; do
    echo "Running $model"
    main_cmd=(./main -m "models/ggml-$model.bin" -owts -f "$1" -of "$1.$model")
    if [ -n "$2" ]; then
        main_cmd+=(-fp "$2")
    fi
    #TODO: Surface errors better
    # TIMEFMT is for zsh, TIMEFORMAT is for bash
    EXECTIME=$({ TIMEFMT="%E";TIMEFORMAT=%E; time "${main_cmd[@]}" >/dev/null 2>&1; } 2>&1)
    # Slightly different formats between zsh and bash
    # ('=' rather than '==': the latter is not valid inside [ ] in every shell)
    if [ "${EXECTIME: -1}" = "s" ]; then
        EXECTIME=${EXECTIME::-1}
    fi
    RATIO=$(echo "$DURATION / $EXECTIME" | bc -l)
    RATIO=$(printf "%.2f" "$RATIO")
    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"
    # If the file already exists, delete it
    if [ -f "$1.mp4" ]; then
        rm "$1.mp4"
    fi
    # the generated .wts script is expected to produce "$1.mp4"; keep one copy per model
    bash "$1.$model.wts" >/dev/null 2>&1
    mv "$1.mp4" "$1.$model.mp4"
    # caption strip (model name + timing) that is stacked above the model's video
    ffmpeg -y -f lavfi -i color=c=black:s=1200x50:d=$DURATION -vf "drawtext=fontfile=$2:fontsize=36:x=10:y=(h-text_h)/2:text='ggml-$model - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey" "$1.$model.info.mp4" >/dev/null 2>&1
 done
 # Assemble the final ffmpeg call: two inputs (caption + video) per model
 stack_cmd=(ffmpeg -y)
 for model in "${models[@]}"; do
    stack_cmd+=(-i "$1.$model.info.mp4" -i "$1.$model.mp4")
 done
 # filter graph: list every video input, then vstack them into [v]
 FILTER=""
 COUNT=0
 for model in "${models[@]}"; do
    FILTER="${FILTER}[${COUNT}:v][$((COUNT+1)):v]"
    COUNT=$((COUNT+2))
 done
 FILTER="${FILTER} vstack=inputs=${COUNT}[v]"
 # audio is taken from input 1, i.e. the first model's rendered video
 stack_cmd+=(-filter_complex "$FILTER" -map "[v]" -map 1:a "$1.all.mp4")
 echo "${stack_cmd[@]}"
 # Run the command
 "${stack_cmd[@]}" >/dev/null 2>&1
--- a/ggml.c
+++ b/ggml.c
--- a/ggml.h
+++ b/ggml.h
@ -198,8 +198,6 @@ struct ggml_object;
 struct ggml_context;
 enum ggml_type {
    GGML_TYPE_Q4_0,
    GGML_TYPE_Q4_1,
    GGML_TYPE_I8,
    GGML_TYPE_I16,
    GGML_TYPE_I32,
@ -328,10 +326,7 @@ void ggml_print_objects(const struct ggml_context * ctx);
 int    ggml_nelements(const struct ggml_tensor * tensor);
 size_t ggml_nbytes   (const struct ggml_tensor * tensor);
-int    ggml_blck_size (enum ggml_type type);
+size_t ggml_type_size   (enum ggml_type type);
 size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
 float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
 size_t ggml_element_size(const struct ggml_tensor * tensor);
 struct ggml_context * ggml_init(struct ggml_init_params params);
--- a/models/README.md
+++ b/models/README.md
@ -6,7 +6,7 @@ using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either
 the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
 script to download the already converted models. Currently, they are hosted on the following locations:
- https://huggingface.co/datasets/ggerganov/whisper.cpp
+- https://huggingface.co/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com
 Sample usage:
@ -23,7 +23,7 @@ You can now use it like this:
 A third option to obtain the model files is to download them from Hugging Face:
-https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
+https://huggingface.co/ggerganov/whisper.cpp/tree/main
 ## Available models
--- a/models/convert-h5-to-ggml.py
+++ b/models/convert-h5-to-ggml.py
@ -79,11 +79,11 @@ dir_model   = sys.argv[1]
 dir_whisper = sys.argv[2]
 dir_out     = sys.argv[3]
-with open(dir_model + "/vocab.json", "r") as f:
+with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
    encoder = json.load(f)
-with open(dir_model + "/added_tokens.json", "r") as f:
+with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
    encoder_added = json.load(f)
-with open(dir_model + "/config.json", "r") as f:
+with open(dir_model + "/config.json", "r", encoding="utf8") as f:
    hparams = json.load(f)
 model = WhisperForConditionalGeneration.from_pretrained(dir_model)
--- a/models/download-coreml-model.sh
+++ b/models/download-coreml-model.sh
@ -0,0 +1,82 @@
 #!/bin/bash
 # This script downloads Whisper model files that have already been converted to Core ML format.
 # This way you don't have to convert them yourself.
 src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
 pfx="resolve/main/ggml"
 # get the path of this script
 # Print the absolute directory that contains this script.
 # Uses realpath when it is available, otherwise falls back to cd + pwd -P.
 # All expansions are quoted so that paths containing whitespace resolve correctly.
 function get_script_path() {
    if [ -x "$(command -v realpath)" ]; then
        dirname "$(realpath "$0")"
    else
        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
        echo "$ret"
    fi
 }
 models_path="$(get_script_path)"
 # Whisper models
 models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
 # list available models
 # Print the names in the global "models" array on one line, surrounded by blank lines.
 # Names are passed as printf arguments, never as the format string, so a name
 # containing '%' or '\' is printed literally.
 function list_models {
    printf "\n"
    printf "  Available models:"
    for model in "${models[@]}"; do
        printf " %s" "$model"
    done
    printf "\n\n"
 }
 # exactly one argument is expected: the name of the model to download
 if [ "$#" -ne 1 ]; then
    printf "Usage: %s <model>\n" "$0"
    list_models
    exit 1
 fi
 model=$1
 # reject names that are not in the list of known models
 if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
    printf "Invalid model: %s\n" "$model"
    list_models
    exit 1
 fi
 # download Core ML model
 printf "Downloading Core ML model %s from '%s' ...\n" "$model" "$src"
 # the model is stored next to this script; stop if that directory is not reachable
 cd "$models_path" || exit 1
 if [ -f "ggml-$model.mlmodel" ]; then
    printf "Model %s already exists. Skipping download.\n" "$model"
    exit 0
 fi
 if [ -x "$(command -v wget)" ]; then
    wget --quiet --show-progress -O "ggml-$model.mlmodel" "$src/$pfx-$model.mlmodel"
 elif [ -x "$(command -v curl)" ]; then
    # --fail: report HTTP errors through the exit status instead of saving the error page
    curl -L --fail --output "ggml-$model.mlmodel" "$src/$pfx-$model.mlmodel"
 else
    printf "Either wget or curl is required to download models.\n"
    exit 1
 fi
 # $? is the exit status of whichever download command ran above
 if [ $? -ne 0 ]; then
    # drop any partial file so that the next run does not report it as already downloaded
    rm -f "ggml-$model.mlmodel"
    printf "Failed to download Core ML model %s \n" "$model"
    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
    exit 1
 fi
 printf "Done! Model '%s' saved in 'models/ggml-%s.mlmodel'\n" "$model" "$model"
 printf "Run the following command to compile it:\n\n"
 printf "  $ xcrun coremlc compile ./models/ggml-%s.mlmodel ./models\n\n" "$model"
 printf "You can now use it like this:\n\n"
 printf "  $ ./main -m models/ggml-%s.bin -f samples/jfk.wav\n" "$model"
 printf "\n"
--- a/models/download-ggml-model.cmd
+++ b/models/download-ggml-model.cmd
@ -40,7 +40,7 @@ if exist "ggml-%model%.bin" (
  goto :eof
 )
-PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
+PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
 if %ERRORLEVEL% neq 0 (
  echo Failed to download ggml model %model%
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@ -6,7 +6,7 @@
 #src="https://ggml.ggerganov.com"
 #pfx="ggml-model-whisper"
-src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
+src="https://huggingface.co/ggerganov/whisper.cpp"
 pfx="resolve/main/ggml"
 # get the path of this script
--- a/whisper.cpp
+++ b/whisper.cpp
--- a/whisper.h
+++ b/whisper.h
@ -66,6 +66,7 @@ extern "C" {
    //
    struct whisper_context;
    struct whisper_state;
    typedef int whisper_token;
@ -101,11 +102,20 @@ extern "C" {
    WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
    WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);
-    // Frees all memory allocated by the model.
+    // These are the same as the above, but the internal state of the context is not allocated automatically
-    WHISPER_API void whisper_free(struct whisper_context * ctx);
+    // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
    WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
    WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
    WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
    WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
    // Frees all allocated memory
    WHISPER_API void whisper_free      (struct whisper_context * ctx);
    WHISPER_API void whisper_free_state(struct whisper_state * state);
    // Convert RAW PCM audio to log mel spectrogram.
-    // The resulting spectrogram is stored inside the provided whisper context.
+    // The resulting spectrogram is stored inside the default state of the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel(
            struct whisper_context * ctx,
@ -113,17 +123,30 @@ extern "C" {
                               int   n_samples,
                               int   n_threads);
-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2. 
+    WHISPER_API int whisper_pcm_to_mel_with_state(
-    // The resulting spectrogram is stored inside the provided whisper context.
+            struct whisper_context * ctx,
-    // Returns 0 on success
+              struct whisper_state * state,
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
+                       const float * samples,
        struct whisper_context* ctx,
        const float* samples,
                               int   n_samples,
                               int   n_threads);
    // Convert RAW PCM audio to a log mel spectrogram, applying a Phase Vocoder to speed up the audio x2.
    // The resulting spectrogram is stored inside the default state of the provided whisper context.
    // Returns 0 on success
    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
        struct whisper_context * ctx,
                   const float * samples,
                           int   n_samples,
                           int   n_threads);
-    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
+    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
        struct whisper_context * ctx,
          struct whisper_state * state,
                   const float * samples,
                           int   n_samples,
                           int   n_threads);
    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
    // Returns 0 on success
@ -133,7 +156,14 @@ extern "C" {
                               int   n_len,
                               int   n_mel);
-    // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
+    WHISPER_API int whisper_set_mel_with_state(
            struct whisper_context * ctx,
              struct whisper_state * state,
                       const float * data,
                               int   n_len,
                               int   n_mel);
    // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
    // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
    // offset can be used to specify the offset of the first frame in the spectrogram.
    // Returns 0 on success
@ -142,6 +172,12 @@ extern "C" {
                               int   offset,
                               int   n_threads);
    WHISPER_API int whisper_encode_with_state(
            struct whisper_context * ctx,
              struct whisper_state * state,
                               int   offset,
                               int   n_threads);
    // Run the Whisper decoder to obtain the logits and probabilities for the next token.
    // Make sure to call whisper_encode() first.
    // tokens + n_tokens is the provided context for the decoder.
@ -155,6 +191,14 @@ extern "C" {
                               int   n_past,
                               int   n_threads);
    WHISPER_API int whisper_decode_with_state(
            struct whisper_context * ctx,
              struct whisper_state * state,
               const whisper_token * tokens,
                               int   n_tokens,
                               int   n_past,
                               int   n_threads);
    // Convert the provided text into tokens.
    // The tokens pointer must be large enough to hold the resulting tokens.
    // Returns the number of tokens on success, no more than n_max_tokens
@ -190,20 +234,44 @@ extern "C" {
                               int   n_threads,
                             float * lang_probs);
    WHISPER_API int whisper_lang_auto_detect_with_state(
            struct whisper_context * ctx,
              struct whisper_state * state,
                               int   offset_ms,
                               int   n_threads,
                             float * lang_probs);
    WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
    WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
    WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
    WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
    WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
-    WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
+    WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
    WHISPER_API int whisper_model_f16          (struct whisper_context * ctx);
    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
    // Token logits obtained from the last call to whisper_decode()
    // The logits for the last token are stored in the last row
    // Rows: n_tokens
    // Cols: n_vocab
-    WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
+    WHISPER_API float * whisper_get_logits           (struct whisper_context * ctx);
    WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);
    // Token Id -> String. Uses the vocabulary in the provided context
    WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
    // Special tokens
    WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
@ -218,7 +286,7 @@ extern "C" {
    WHISPER_API whisper_token whisper_token_translate (void);
    WHISPER_API whisper_token whisper_token_transcribe(void);
-    // Performance information
+    // Performance information from the default state.
    WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
    WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
@ -236,18 +304,19 @@ extern "C" {
    // Text segment callback
    // Called on every newly generated text segment
    // Use the whisper_full_...() functions to obtain the text segments
-    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
+    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
    // Encoder begin callback
    // If not NULL, called before the encoder starts
    // If it returns false, the computation is aborted
-    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);
+    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
    // Logits filter callback
    // Can be used to modify the logits before sampling
    // If not NULL, called after applying temperature to logits
    typedef void (*whisper_logits_filter_callback)(
            struct whisper_context * ctx,
              struct whisper_state * state,
          const whisper_token_data * tokens,
                               int   n_tokens,
                             float * logits,
@ -334,6 +403,7 @@ extern "C" {
    WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
    // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
    // Not thread safe for same context
    // Uses the specified decoding strategy to obtain the text.
    WHISPER_API int whisper_full(
                struct whisper_context * ctx,
@ -341,7 +411,16 @@ extern "C" {
                           const float * samples,
                                   int   n_samples);
-    // Split the input audio in chunks and process each chunk separately using whisper_full()
+    WHISPER_API int whisper_full_with_state(
                struct whisper_context * ctx,
                  struct whisper_state * state,
            struct whisper_full_params   params,
                           const float * samples,
                                   int   n_samples);
    // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
    // Result is stored in the default state of the context
    // Not thread safe if executed in parallel on the same context.
    // It seems this approach can offer some speedup in some cases.
    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
    WHISPER_API int whisper_full_parallel(
@ -351,40 +430,56 @@ extern "C" {
                                   int   n_samples,
                                   int   n_processors);
-    // Number of generated text segments.
+    // Number of generated text segments
    // A segment can be a few words, a sentence, or even a paragraph.
-    WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
+    WHISPER_API int whisper_full_n_segments           (struct whisper_context * ctx);
    WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);
-    // Language id associated with the current context
+    // Language id associated with the context's default state
    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
-    // Get the start and end time of the specified segment.
+    // Language id associated with the provided state
-    WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
+    WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
    WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
-    // Get the text of the specified segment.
+    // Get the start and end time of the specified segment
-    WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
+    WHISPER_API int64_t whisper_full_get_segment_t0           (struct whisper_context * ctx, int i_segment);
    WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
-    // Get number of tokens in the specified segment.
+    WHISPER_API int64_t whisper_full_get_segment_t1           (struct whisper_context * ctx, int i_segment);
-    WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
+    WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);
    // Get the text of the specified segment
    WHISPER_API const char * whisper_full_get_segment_text           (struct whisper_context * ctx, int i_segment);
    WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);
    // Get number of tokens in the specified segment
    WHISPER_API int whisper_full_n_tokens           (struct whisper_context * ctx, int i_segment);
    WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);
    // Get the token text of the specified token in the specified segment
    WHISPER_API const char * whisper_full_get_token_text           (struct whisper_context * ctx, int i_segment, int i_token);
    WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
    // Get the token text of the specified token in the specified segment.
    WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
    WHISPER_API whisper_token whisper_full_get_token_id           (struct whisper_context * ctx, int i_segment, int i_token);
    WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);
-    // Get token data for the specified token in the specified segment.
+    // Get token data for the specified token in the specified segment
    // This contains probabilities, timestamps, etc.
-    WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API whisper_token_data whisper_full_get_token_data           (struct whisper_context * ctx, int i_segment, int i_token);
    WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);
-    // Get the probability of the specified token in the specified segment.
+    // Get the probability of the specified token in the specified segment
-    WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
+    WHISPER_API float whisper_full_get_token_p           (struct whisper_context * ctx, int i_segment, int i_token);
    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
    ////////////////////////////////////////////////////////////////////////////
    // Temporary helpers needed for exposing ggml interface
    WHISPER_API int whisper_bench_memcpy(int n_threads);
    WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
    WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
 #ifdef __cplusplus
 }
Author	SHA1	Message	Date
Georgi Gerganov	0244810697	rebase on master after whisper_state changes	2023-03-26 16:09:06 +03:00
Georgi Gerganov	6efb04fc72	coreml : simplify whisper_encode + log messages	2023-03-26 15:48:45 +03:00
Georgi Gerganov	ee0d6ff473	coreml : use Core ML encoder inference	2023-03-26 15:48:41 +03:00
Georgi Gerganov	8e361d90d7	whisper : disable fallbacks until the performance is improved (#588 )	2023-03-22 22:34:39 +02:00
Andrew Huynh	fc49c44426	cmake : add a flag to disable F16C (#628 )	2023-03-22 22:30:40 +02:00
jwijffels	aec01bb337	Include link to R wrapper in README (#626 )	2023-03-22 22:28:22 +02:00
Lucas Zanek	21165580a1	Nodejs Addon blocking main thread. Implemented Napi::AsyncWorker (#642 ) * fixed blocking code on node addon * modify the example to run async * format * added logic to see the whisper output * added logic to see the whisper output * removed extra function for more clean example	2023-03-22 22:19:22 +02:00
Jhen-Jie Hong	1d749919e3	whisper.objc : add `-O3 -DNDEBUG` in release mode (#640 )	2023-03-22 22:16:04 +02:00
sandrohanea	d4fa0d92ad	fixed language auto-detection for state provided processing (#627 ) Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com>	2023-03-22 21:47:09 +02:00
Jhen-Jie Hong	a5e60c019d	readme : add react-native bindings (#619 )	2023-03-22 21:39:02 +02:00
Leo Moll	8fcd1a3b32	main : provide option for creating JSON output (#615 ) * examples : provide option for exporting also as JSON file (ggerganov/whisper.cpp#614) * main : remove leftovers --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-22 21:37:36 +02:00
Kamilake	992aa2cd1b	models : change default encoding to utf8 (#605 )	2023-03-22 21:17:24 +02:00
Georgi Gerganov	4aa3bcf8a4	make : fix MUSL Linux build (#576 )	2023-03-22 20:51:42 +02:00
Georgi Gerganov	1beff6f66d	models : change HF hosting from dataset to model	2023-03-22 20:44:56 +02:00
Takeshi Inoue	09e9068007	whisper.android : support benchmark for Android example. (#542 ) * whisper.android: Support benchmark for Android example. * whisper.android: update screenshot in README. * update: Make text selectable for copy & paste. * Update whisper.h to restore API name Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * whisper.android: Restore original API names. --------- Co-authored-by: tinoue <tinoue@xevo.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-07 21:36:30 +02:00
Georgi Gerganov	fa9d43181f	readme : add bench-wts.sh demo	2023-03-06 21:06:27 +02:00
Georgi Gerganov	bb6b54a03d	bench-wts.sh : rename script + add execute permission	2023-03-06 21:02:24 +02:00
venkr	b597c5a779	qual-bench.sh : add quality comparison tool, and update main.cpp to allow using a font file (#569 )	2023-03-06 19:18:11 +02:00
Takeshi Inoue	a3fb6c507f	whisper.android : enable fp16 intrinsics (FP16_VA), which are supported by ARMv8.2 or later. (#572 )	2023-03-06 19:15:57 +02:00
sandrohanea	59fdcd19c8	whisper : add whisper_state + default state on the whisper_context (#523 ) * Added whisper state + default state on the whisper_context * Fixed some examples and bindings * Fixed whisper_n_len (which was used in some binding) and added whisper_n_len_from_state * Fixed comments * whisper : reuse kv_cache_free() and fix compiler warnings * whisper : clean-up the API comments --------- Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com> Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-03-05 21:42:19 +02:00
Georgi Gerganov	478289a4b3	whisper : set no_context == true by default (#537 )	2023-03-05 20:53:43 +02:00
polarmoon	5e94129cb2	go : NewContext now returns a clean context (#537 ) Co-authored-by: Ming <ming@localhost>	2023-03-05 20:50:25 +02:00
HY. Kelvin Lee	72af0f5697	main : add csv header (#552 )	2023-03-02 18:32:16 +02:00
Georgi Gerganov	af005d573f	make : add -DNDEBUG compile flag	2023-02-28 23:27:54 +02:00
Georgi Gerganov	ad1389003d	release : v1.2.1	2023-02-28 22:29:12 +02:00
FlippFuzz	f420de1322	make : add "-mcpu=native" when building for aarch64 (#532 )	2023-02-27 21:04:16 +02:00
Aaron Pham	d176160f6f	readme : add pybind11 bindings (#538 )	2023-02-27 21:02:11 +02:00