mirror of https://github.com/ggerganov/whisper.cpp.git
synced 2025-07-04 16:30:58 +02:00

Compare commits: talk.llama...musl (1 commit, SHA1 bf5d4c81b9)

.gitignore (vendored): 4 changes
@@ -1,7 +1,5 @@
 *.o
 *.a
-*.mlmodel
-*.mlmodelc
 .cache/
 .vs/
 .vscode/
@@ -34,5 +32,3 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata

 extra/bench-gg.txt
-
-*.mlmodel*
CMakeLists.txt:

@@ -54,8 +54,6 @@ if (APPLE)
     option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
     option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
     option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
-
-    option(WHISPER_COREML              "whisper: enable Core ML framework" OFF)
 else()
     option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@@ -88,33 +86,16 @@ endif()

 find_package(Threads REQUIRED)

-# on APPLE
-if (APPLE)
-    # include Accelerate framework
-    if (NOT WHISPER_NO_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate)
-
-        if (ACCELERATE_FRAMEWORK)
-            message(STATUS "Accelerate framework found")
-
-            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-        else()
-            message(WARNING "Accelerate framework not found")
-        endif()
-    endif()
-
-    if (WHISPER_COREML)
-        find_library(FOUNDATION_FRAMEWORK Foundation)
-        find_library(COREML_FRAMEWORK CoreML)
-
-        if (COREML_FRAMEWORK)
-            message(STATUS "CoreML framework found")
-
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
-        else()
-            message(WARNING "CoreML framework not found")
-        endif()
-    endif()
-endif()
+# on APPLE - include Accelerate framework
+if (APPLE AND NOT WHISPER_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
+endif()
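For reference, the `-DGGML_USE_ACCELERATE` flag that survives on the right-hand side is consumed inside ggml, where it selects a BLAS backend at compile time. A minimal sketch of that gating follows; it is based on how ggml typically guards these includes, not quoted from this diff:

```c
// Sketch of the compile-time backend selection fed by -DGGML_USE_ACCELERATE
// (an assumption about ggml's includes, not the verbatim source):
// Accelerate supplies the CBLAS routines on Apple platforms, while
// OpenBLAS supplies them elsewhere when WHISPER_SUPPORT_OPENBLAS is on.
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
#elif defined(GGML_USE_OPENBLAS)
#include <cblas.h>
#endif
```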
@@ -191,9 +172,7 @@ else()
         if(NOT WHISPER_NO_FMA)
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
         endif()
-        if(NOT WHISPER_NO_F16C)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-        endif()
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
     endif()
 endif()
 endif()
@@ -202,33 +181,6 @@ if (WHISPER_PERF)
     set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()

-#
-# whisper.coreml - Core ML support
-#
-
-if (WHISPER_COREML)
-    set(TARGET whisper.coreml)
-
-    add_library(${TARGET}
-        coreml/whisper-encoder.h
-        coreml/whisper-encoder.mm
-        coreml/whisper-encoder-impl.h
-        coreml/whisper-encoder-impl.m
-        )
-
-    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PUBLIC
-        .
-        )
-
-    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
-
-    set_target_properties(${TARGET} PROPERTIES
-        COMPILE_FLAGS "-fobjc-arc"
-        )
-endif()
-
 #
 # whisper - this is the main library of the project
 #
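The removed `whisper.coreml` target is what used to inject `-DWHISPER_USE_COREML` into the build. A sketch of the kind of call-site gating that define enables is below; the `whisper_coreml_*` functions are real and declared in `coreml/whisper-encoder.h` (deleted later in this diff), while `encoder_state` and `try_coreml_encode` are hypothetical names used only for illustration:

```cpp
// Hedged sketch (not the verbatim whisper.cpp wiring) of what the
// -DWHISPER_USE_COREML define from the deleted target above gates.
#include <vector>

#ifdef WHISPER_USE_COREML
#include "coreml/whisper-encoder.h"
#endif

struct encoder_state {
#ifdef WHISPER_USE_COREML
    whisper_coreml_context * ctx_coreml = nullptr; // loaded from a .mlmodelc path
#endif
};

// Returns true if the mel segment was encoded by Core ML, false if the
// caller should fall back to the regular ggml encoder path.
bool try_coreml_encode(encoder_state & st, std::vector<float> & mel, std::vector<float> & out) {
#ifdef WHISPER_USE_COREML
    if (st.ctx_coreml) {
        whisper_coreml_encode(st.ctx_coreml, mel.data(), out.data());
        return true;
    }
#endif
    (void) st; (void) mel; (void) out;
    return false;
}
```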
@@ -248,10 +200,6 @@ target_include_directories(${TARGET} PUBLIC
     .
     )

-if (WHISPER_COREML)
-    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
-endif()
-
 if (MSVC)
     target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
Makefile: 46 changes
@@ -35,7 +35,7 @@ CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

 # ref: https://github.com/ggerganov/whisper.cpp/issues/37
-ifneq ($(wildcard /usr/include/musl/*),)
+ifneq (,$(findstring musl,$(CCV)))
        CFLAGS   += -D_POSIX_SOURCE -D_GNU_SOURCE
        CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
 endif
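The old check probed for musl header directories on disk; the new one searches the compiler's version string for "musl" (`CCV` is presumably populated elsewhere in the Makefile from `$(CC) --version`). The `-D_POSIX_SOURCE -D_GNU_SOURCE` defines are feature-test macros: on a strict libc such as musl, standard headers declare POSIX/GNU extensions only when such a macro is set. A generic illustration with an assumed symbol, since the diff itself does not say which declarations the build was missing:

```c
// Generic illustration of what -D_POSIX_SOURCE / -D_GNU_SOURCE do (the
// symbol below is chosen for illustration only). Under strict ISO C
// (e.g. -std=c11), a conforming libc hides POSIX extensions such as
// strdup unless a feature-test macro requests them.
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void) {
    char * copy = strdup("musl");   // declared only because _GNU_SOURCE is set above
    printf("%s\n", copy);
    free(copy);
    return 0;
}
```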
@@ -138,10 +138,6 @@ ifndef WHISPER_NO_ACCELERATE
        LDFLAGS  += -framework Accelerate
 endif
 endif
-ifdef WHISPER_COREML
-       CXXFLAGS += -DWHISPER_USE_COREML
-       LDFLAGS  += -framework Foundation -framework CoreML
-endif
 ifdef WHISPER_OPENBLAS
        CFLAGS   += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
        LDFLAGS  += -lopenblas
@@ -194,23 +190,11 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h
        $(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o

-ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
-else
-whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
-       $(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
-
-whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
-       $(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
-
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
-endif
-
-libwhisper.a: ggml.o $(WHISPER_OBJ)
-       $(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
+libwhisper.a: ggml.o whisper.o
+       $(AR) rcs libwhisper.a ggml.o whisper.o

-libwhisper.so: ggml.o $(WHISPER_OBJ)
-       $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
+libwhisper.so: ggml.o whisper.o
+       $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
        rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
@@ -224,21 +208,21 @@ CC_SDL=`sdl2-config --cflags --libs`
 SRC_COMMON     = examples/common.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp

-main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
-       $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
+       $(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
        ./main -h

-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-       $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+       $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)

-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-       $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+       $(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)

-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-       $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+       $(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)

-bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
-       $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o whisper.o
+       $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

 #
 # Audio samples
README.md: 17 changes
@@ -433,19 +433,6 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a

 ---

-## Video comparison of different models
-
-Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:
-
-```java
-./extra/bench-wts.sh samples/jfk.wav
-ffplay ./samples/jfk.wav.all.mp4
-```
-
-https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
-
----
-
 ## Benchmarks

 In order to have an objective comparison of the performance of the inference across different system configurations,
@@ -466,7 +453,7 @@ The original models are converted to a custom binary format. This allows to pack
 You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
 or manually from here:

-- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
@@ -476,7 +463,6 @@ in [models](models).

 - [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
-- React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
 - [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
@@ -486,7 +472,6 @@ in [models](models).
 - [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
   - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
   - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
-- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)

 ## Examples
bindings/go (file header not preserved in the capture):

@@ -17,9 +17,9 @@ import (
 // CONSTANTS

 const (
-       srcUrl  = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main"          // The location of the models
+       srcUrl  = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
        srcExt  = ".bin"                                                               // Filename extension
        bufSize = 1024 * 64                                                            // Size of the buffer used for downloading the model
 )

 var (
coreml/whisper-encoder-impl.h (deleted file):

@@ -1,142 +0,0 @@
-//
-// CoremlEncoder.h
-//
-// This file was automatically generated and should not be edited.
-//
-
-#import <Foundation/Foundation.h>
-#import <CoreML/CoreML.h>
-#include <stdint.h>
-#include <os/log.h>
-
-NS_ASSUME_NONNULL_BEGIN
-
-
-/// Model Prediction Input Type
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
-
-/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
-- (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
-
-@end
-
-
-/// Model Prediction Output Type
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
-
-/// output as multidimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * output;
-- (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
-
-@end
-
-
-/// Class for model loading and prediction
-API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
-@interface CoremlEncoder : NSObject
-@property (readonly, nonatomic, nullable) MLModel * model;
-
-/**
-    URL of the underlying .mlmodelc directory.
-*/
-+ (nullable NSURL *)URLOfModelInThisBundle;
-
-/**
-    Initialize CoremlEncoder instance from an existing MLModel object.
-
-    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
-    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
-*/
-- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-*/
-- (nullable instancetype)init;
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Construct CoremlEncoder instance asynchronously with configuration.
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
-
-/**
-    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
-
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param modelURL The model URL.
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
-
-/**
-    Make a prediction using the standard interface
-    @param input an instance of CoremlEncoderInput to predict from
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Make a prediction using the standard interface
-    @param input an instance of CoremlEncoderInput to predict from
-    @param options prediction options
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Make a prediction using the convenience interface
-    @param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the prediction as CoremlEncoderOutput
-*/
-- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-
-/**
-    Batch prediction
-    @param inputArray array of CoremlEncoderInput instances to obtain predictions from
-    @param options prediction options
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-    @return the predictions as NSArray<CoremlEncoderOutput *>
-*/
-- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
-@end
-
-NS_ASSUME_NONNULL_END
coreml/whisper-encoder-impl.m (deleted file):

@@ -1,197 +0,0 @@
-//
-// CoremlEncoder.m
-//
-// This file was automatically generated and should not be edited.
-//
-
-#if !__has_feature(objc_arc)
-#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
-#endif
-
-#import "whisper-encoder-impl.h"
-
-@implementation CoremlEncoderInput
-
-- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
-    self = [super init];
-    if (self) {
-        _melSegment = melSegment;
-    }
-    return self;
-}
-
-- (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"melSegment"]];
-}
-
-- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"melSegment"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
-    }
-    return nil;
-}
-
-@end
-
-@implementation CoremlEncoderOutput
-
-- (instancetype)initWithOutput:(MLMultiArray *)output {
-    self = [super init];
-    if (self) {
-        _output = output;
-    }
-    return self;
-}
-
-- (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"output"]];
-}
-
-- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"output"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.output];
-    }
-    return nil;
-}
-
-@end
-
-@implementation CoremlEncoder
-
-
-/**
-    URL of the underlying .mlmodelc directory.
-*/
-+ (nullable NSURL *)URLOfModelInThisBundle {
-    NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
-    if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
-    return [NSURL fileURLWithPath:assetPath];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from an existing MLModel object.
-
-    Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
-    Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
-*/
-- (instancetype)initWithMLModel:(MLModel *)model {
-    self = [super init];
-    if (!self) { return nil; }
-    _model = model;
-    if (_model == nil) { return nil; }
-    return self;
-}
-
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-*/
-- (nullable instancetype)init {
-    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
-}
-
-
-/**
-    Initialize CoremlEncoder instance with the model in this bundle.
-
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
-    if (model == nil) { return nil; }
-    return [self initWithMLModel:model];
-}
-
-
-/**
-    Initialize CoremlEncoder instance from the model URL.
-
-    @param modelURL URL to the .mlmodelc directory for CoremlEncoder.
-    @param configuration The model configuration object
-    @param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
-*/
-- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
-    if (model == nil) { return nil; }
-    return [self initWithMLModel:model];
-}
-
-
-/**
-    Construct CoremlEncoder instance asynchronously with configuration.
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
-    [self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
-              configuration:configuration
-          completionHandler:handler];
-}
-
-
-/**
-    Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
-
-    Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
-
-    @param modelURL The model URL.
-    @param configuration The model configuration
-    @param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
-*/
-+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
-    [MLModel loadContentsOfURL:modelURL
-                 configuration:configuration
-             completionHandler:^(MLModel *model, NSError *error) {
-        if (model != nil) {
-            CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
-            handler(typedModel, nil);
-        } else {
-            handler(nil, error);
-        }
-    }];
-}
-
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
-}
-
-- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
-    if (!outFeatures) { return nil; }
-    return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
-}
-
-- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
-    return [self predictionFromFeatures:input_ error:error];
-}
-
-- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
-    id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
-    id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
-    if (!outBatch) { return nil; }
-    NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
-    for (NSInteger i = 0; i < outBatch.count; i++) {
-        id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
-        [results addObject:result];
-    }
-    return results;
-}
-
-@end
coreml/whisper-encoder.h (deleted file):

@@ -1,22 +0,0 @@
-// Wrapper of the Core ML Whisper Encoder model
-//
-// Code is derived from the work of Github user @wangchou
-// ref: https://github.com/wangchou/callCoreMLFromCpp
-
-#if __cplusplus
-extern "C" {
-#endif
-
-struct whisper_coreml_context;
-
-struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
-void whisper_coreml_free(struct whisper_coreml_context * ctx);
-
-void whisper_coreml_encode(
-        const whisper_coreml_context * ctx,
-        float * mel,
-        float * out);
-
-#if __cplusplus
-}
-#endif
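Taken together, the three functions above are the whole C surface of the Core ML encoder wrapper. A hedged usage sketch follows; the function names come from the header itself, while the model path and the output buffer size are illustrative assumptions (the 80 × 3000 mel shape comes from the implementation below):

```cpp
// Hypothetical driver for the deleted C wrapper above.
#include <cstdio>
#include <vector>

#include "coreml/whisper-encoder.h"

int main() {
    struct whisper_coreml_context * ctx =
        whisper_coreml_init("ggml-base.en-encoder.mlmodelc"); // illustrative path

    if (ctx == nullptr) {
        fprintf(stderr, "failed to load the Core ML encoder\n");
        return 1;
    }

    std::vector<float> mel(80 * 3000);   // one mel segment, per the shape used in the .mm file
    std::vector<float> out(512 * 1500);  // encoder output; actual size depends on the model (assumption)

    whisper_coreml_encode(ctx, mel.data(), out.data());

    whisper_coreml_free(ctx);
    return 0;
}
```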
coreml/whisper-encoder.mm (deleted file):

@@ -1,61 +0,0 @@
-#import "coreml/whisper-encoder.h"
-#import "coreml/whisper-encoder-impl.h"
-
-#import <CoreML/CoreML.h>
-
-#include <stdlib.h>
-
-#if __cplusplus
-extern "C" {
-#endif
-
-struct whisper_coreml_context {
-    const void * data;
-};
-
-struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
-    NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
-
-    NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
-
-    const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
-
-    if (data == NULL) {
-        return NULL;
-    }
-
-    whisper_coreml_context * ctx = new whisper_coreml_context;
-
-    ctx->data = data;
-
-    return ctx;
-}
-
-void whisper_coreml_free(struct whisper_coreml_context * ctx) {
-    CFRelease(ctx->data);
-    delete ctx;
-}
-
-void whisper_coreml_encode(
-        const whisper_coreml_context * ctx,
-        float * mel,
-        float * out) {
-    MLMultiArray * inMultiArray = [
-        [MLMultiArray alloc] initWithDataPointer: mel
-                                           shape: @[@1, @80, @3000]
-                                        dataType: MLMultiArrayDataTypeFloat32
-                                         strides: @[@(240000), @(3000), @1]
-                                     deallocator: nil
-                                           error: nil
-    ];
-
-    CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];
-
-    MLMultiArray * outMA = outCoreML.output;
-
-    memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
-}
-
-#if __cplusplus
-}
-#endif
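One detail worth checking in the wrapper above: for shape [1, 80, 3000] with strides [240000, 3000, 1], the buffer is plain contiguous row-major float32, which is why the raw mel pointer can be handed to MLMultiArray without copying. The arithmetic, restated as compile-time checks (a sketch, not part of the original file):

```cpp
// Layout check for the MLMultiArray wrapper above: a contiguous row-major
// [1, 80, 3000] float32 buffer has exactly these strides (in elements).
static_assert(80 * 3000 == 240000, "dim-0 stride spans one whole mel segment");
static_assert(240000 * sizeof(float) == 960000, "one mel segment is 960000 bytes where float is 4 bytes");
```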
examples/CMakeLists.txt:

@@ -63,5 +63,4 @@ else()
     add_subdirectory(command)
     add_subdirectory(bench)
     add_subdirectory(talk)
-    add_subdirectory(talk.llama)
 endif()
examples/addon.node/addon.cpp:

@@ -292,64 +292,51 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
     return 0;
 }

-class Worker : public Napi::AsyncWorker {
- public:
-  Worker(Napi::Function& callback, whisper_params params)
-      : Napi::AsyncWorker(callback), params(params) {}
-
-  void Execute() override {
-    run(params, result);
-  }
-
-  void OnOK() override {
-    Napi::HandleScope scope(Env());
-    Napi::Object res = Napi::Array::New(Env(), result.size());
-    for (uint64_t i = 0; i < result.size(); ++i) {
-      Napi::Object tmp = Napi::Array::New(Env(), 3);
-      for (uint64_t j = 0; j < 3; ++j) {
-        tmp[j] = Napi::String::New(Env(), result[i][j]);
-      }
-      res[i] = tmp;
-    }
-    Callback().Call({Env().Null(), res});
-  }
-
- private:
-  whisper_params params;
-  std::vector<std::vector<std::string>> result;
-};
-
-Napi::Value whisper(const Napi::CallbackInfo& info) {
-  Napi::Env env = info.Env();
-  if (info.Length() <= 0 || !info[0].IsObject()) {
-    Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
-  }
-  whisper_params params;
-
-  Napi::Object whisper_params = info[0].As<Napi::Object>();
-  std::string language = whisper_params.Get("language").As<Napi::String>();
-  std::string model = whisper_params.Get("model").As<Napi::String>();
-  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
-
-  params.language = language;
-  params.model = model;
-  params.fname_inp.emplace_back(input);
-
-  Napi::Function callback = info[1].As<Napi::Function>();
-  Worker* worker = new Worker(callback, params);
-  worker->Queue();
-  return env.Undefined();
-}
+Napi::Object whisper(const Napi::CallbackInfo& info) {
+  Napi::Env env = info.Env();
+  if (info.Length() <= 0 || !info[0].IsObject()) {
+    Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
+  }
+  whisper_params params;
+  std::vector<std::vector<std::string>> result;
+
+  Napi::Object whisper_params = info[0].As<Napi::Object>();
+  std::string language = whisper_params.Get("language").As<Napi::String>();
+  std::string model = whisper_params.Get("model").As<Napi::String>();
+  std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
+
+  params.language = language;
+  params.model = model;
+  params.fname_inp.emplace_back(input);
+
+  // run model
+  run(params, result);
+
+  fprintf(stderr, "RESULT:\n");
+  for (auto sentence:result) {
+    fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
+        sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
+  }
+
+  Napi::Object res = Napi::Array::New(env, result.size());
+  for (uint64_t i = 0; i < result.size(); ++i) {
+    Napi::Object tmp = Napi::Array::New(env, 3);
+    for (uint64_t j = 0; j < 3; ++j) {
+      tmp[j] = Napi::String::New(env, result[i][j]);
+    }
+    res[i] = tmp;
+  }
+
+  return res;
+}

 Napi::Object Init(Napi::Env env, Napi::Object exports) {
   exports.Set(
       Napi::String::New(env, "whisper"),
       Napi::Function::New(env, whisper)
   );
   return exports;
 }

 NODE_API_MODULE(whisper, Init);
examples/addon.node/index.js:

@@ -1,36 +1,27 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../build/Release/whisper-addon"
-));
-const { promisify } = require("util");
-
-const whisperAsync = promisify(whisper);
-
+const path = require('path');
+const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));
+
 const whisperParams = {
-  language: "en",
-  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
-  fname_inp: "../../samples/jfk.wav",
+  language: 'en',
+  model: path.join(__dirname, '../../models/ggml-base.en.bin'),
+  fname_inp: '',
 };

 const arguments = process.argv.slice(2);
 const params = Object.fromEntries(
   arguments.reduce((pre, item) => {
     if (item.startsWith("--")) {
       return [...pre, item.slice(2).split("=")];
     }
     return pre;
-  }, [])
+  }, []),
 );

 for (const key in params) {
   if (whisperParams.hasOwnProperty(key)) {
     whisperParams[key] = params[key];
   }
 }

-console.log("whisperParams =", whisperParams);
-
-whisperAsync(whisperParams).then((result) => {
-  console.log(`Result from whisper: ${result}`);
-});
+console.log('whisperParams =', whisperParams);
+console.log(whisper(whisperParams));
Usage text (file header not preserved in the capture):

@@ -31,7 +31,6 @@ options:
   -osrt,     --output-srt        [false  ] output result in a srt file
   -owts,     --output-words      [false  ] output script for generating karaoke video
   -ocsv,     --output-csv        [false  ] output result in a CSV file
-  -oj,       --output-json       [false  ] output result in a JSON file
   -of FNAME, --output-file FNAME [       ] output file path (without file extension)
   -ps,       --print-special    [false  ] print special tokens
   -pc,       --print-colors      [false  ] print colors
examples/main/main.cpp:

@@ -73,7 +73,6 @@ struct whisper_params {
     bool output_srt      = false;
     bool output_wts      = false;
     bool output_csv      = false;
-    bool output_jsn      = false;
     bool print_special   = false;
     bool print_colors    = false;
     bool print_progress  = false;
@@ -131,7 +130,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
         else if (arg == "-owts" || arg == "--output-words")  { params.output_wts    = true; }
         else if (arg == "-fp"   || arg == "--font-path")     { params.font_path     = argv[++i]; }
         else if (arg == "-ocsv" || arg == "--output-csv")    { params.output_csv    = true; }
-        else if (arg == "-oj"   || arg == "--output-json")   { params.output_jsn    = true; }
         else if (arg == "-of"   || arg == "--output-file")   { params.fname_out.emplace_back(argv[++i]); }
         else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
         else if (arg == "-pc"   || arg == "--print-colors")  { params.print_colors  = true; }
@@ -180,7 +178,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
     fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
     fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
     fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                params.output_csv ? "true" : "false");
-    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",               params.output_jsn ? "true" : "false");
     fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",  "");
     fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                       params.print_special ? "true" : "false");
     fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                               params.print_colors ? "true" : "false");
@@ -371,129 +368,6 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
     return true;
 }

-bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
-    std::ofstream fout(fname);
-    int indent = 0;
-
-    auto doindent = [&]() {
-        for (int i = 0; i < indent; i++) fout << "\t";
-    };
-
-    auto start_arr = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": [\n";
-        indent++;
-    };
-
-    auto end_arr = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "]\n" : "},\n");
-    };
-
-    auto start_obj = [&](const char *name = nullptr) {
-        doindent();
-        if (name) {
-            fout << "\"" << name << "\": {\n";
-        } else {
-            fout << "{\n";
-        }
-        indent++;
-    };
-
-    auto end_obj = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "}\n" : "},\n");
-    };
-
-    auto start_value = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": ";
-    };
-
-    auto value_s = [&](const char *name, const char *val, bool end = false) {
-        start_value(name);
-        fout << "\"" << val << (end ? "\"\n" : "\",\n");
-    };
-
-    auto end_value = [&](bool end = false) {
-        fout << (end ? "\n" : ",\n");
-    };
-
-    auto value_i = [&](const char *name, const int64_t val, bool end = false) {
-        start_value(name);
-        fout << val;
-        end_value(end);
-    };
-
-    auto value_b = [&](const char *name, const bool val, bool end = false) {
-        start_value(name);
-        fout << (val ? "true" : "false");
-        end_value(end);
-    };
-
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-    start_obj();
-        value_s("systeminfo", whisper_print_system_info());
-        start_obj("model");
-            value_s("type", whisper_model_type_readable(ctx));
-            value_b("multilingual", whisper_is_multilingual(ctx));
-            value_i("vocab", whisper_model_n_vocab(ctx));
-            start_obj("audio");
-                value_i("ctx", whisper_model_n_audio_ctx(ctx));
-                value_i("state", whisper_model_n_audio_state(ctx));
-                value_i("head", whisper_model_n_audio_head(ctx));
-                value_i("layer", whisper_model_n_audio_layer(ctx), true);
-            end_obj();
-            start_obj("text");
-                value_i("ctx", whisper_model_n_text_ctx(ctx));
-                value_i("state", whisper_model_n_text_state(ctx));
-                value_i("head", whisper_model_n_text_head(ctx));
-                value_i("leyer", whisper_model_n_text_layer(ctx), true);
-            end_obj();
-            value_i("mels", whisper_model_n_mels(ctx));
-            value_i("f16", whisper_model_f16(ctx), true);
-        end_obj();
-        start_obj("params");
-            value_s("model", params.model.c_str());
-            value_s("language", params.language.c_str());
-            value_b("translate", params.translate, true);
-        end_obj();
-        start_obj("result");
-            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
-        end_obj();
-        start_arr("transcription");
-
-        const int n_segments = whisper_full_n_segments(ctx);
-        for (int i = 0; i < n_segments; ++i) {
-            const char * text = whisper_full_get_segment_text(ctx, i);
-            const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-            const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-            start_obj();
-                start_obj("timestanps");
-                    value_s("from", to_timestamp(t0, true).c_str());
-                    value_s("to", to_timestamp(t1, true).c_str(), true);
-                end_obj();
-                start_obj("offsets");
-                    value_i("from", t0 * 10);
-                    value_i("to", t1 * 10, true);
-                end_obj();
-                value_s("text", text, true);
-            end_obj(i == (n_segments - 1));
-        }
-
-        end_arr(true);
-    end_obj(true);
-    return true;
-}
-
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
@@ -788,12 +662,6 @@ int main(int argc, char ** argv) {
                 const auto fname_csv = fname_out + ".csv";
                 output_csv(ctx, fname_csv.c_str());
             }
-
-            // output to JSON file
-            if (params.output_jsn) {
-                const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params);
-            }
         }
     }
examples/talk.llama/.gitignore (vendored, deleted file): 2 changes

@@ -1,2 +0,0 @@
-eleven-labs.py
-audio.mp3
examples/talk.llama/CMakeLists.txt (deleted file):

@@ -1,12 +0,0 @@
-if (WHISPER_SUPPORT_SDL2)
-    # talk.llama
-    set(TARGET talk-llama)
-
-    # TODO: this is temporary
-    # need to export ggml symbols for MSVC, but too lazy ..
-    add_executable(${TARGET} talk-llama.cpp llama.cpp)
-
-    include(DefaultTargetOptions)
-
-    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-endif ()
examples/talk.llama/README.md (deleted file):

@@ -1,2 +0,0 @@
-# talk.llama
-
File diff suppressed because it is too large.
@ -1,153 +0,0 @@
|
|||||||
#ifndef LLAMA_H
|
|
||||||
#define LLAMA_H
|
|
||||||
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
|
||||||
#include <stdbool.h>
|
|
||||||
|
|
||||||
#ifdef LLAMA_SHARED
|
|
||||||
# ifdef _WIN32
|
|
||||||
# ifdef LLAMA_BUILD
|
|
||||||
# define LLAMA_API __declspec(dllexport)
|
|
||||||
# else
|
|
||||||
# define LLAMA_API __declspec(dllimport)
|
|
||||||
# endif
|
|
||||||
# else
|
|
||||||
# define LLAMA_API __attribute__ ((visibility ("default")))
|
|
||||||
# endif
|
|
||||||
#else
|
|
||||||
# define LLAMA_API
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#define LLAMA_FILE_VERSION 1
|
|
||||||
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
|
|
||||||
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
|
||||||
extern "C" {
|
|
||||||
#endif
|
|
//
// C interface
//
// TODO: show sample usage
//

struct llama_context;

typedef int llama_token;

typedef struct llama_token_data {
    llama_token id;  // token id

    float p;    // probability of the token
    float plog; // log probability of the token

} llama_token_data;

typedef void (*llama_progress_callback)(double progress, void *ctx);

struct llama_context_params {
    int n_ctx;   // text context
    int n_parts; // -1 for default
    int seed;    // RNG seed, 0 for random

    bool f16_kv;     // use fp16 for KV cache
    bool logits_all; // the llama_eval() call computes all logits, not just the last one
    bool vocab_only; // only load the vocabulary, no weights
    bool use_mlock;  // force system to keep model in RAM
    bool embedding;  // embedding mode only

    // called with a progress value between 0 and 1, pass NULL to disable
    llama_progress_callback progress_callback;
    // context pointer passed to the progress callback
    void * progress_callback_user_data;
};

LLAMA_API struct llama_context_params llama_context_default_params();

// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
LLAMA_API struct llama_context * llama_init_from_file(
        const char * path_model,
        struct llama_context_params params);

// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);

// TODO: not great API - very likely to change
// Returns 0 on success
LLAMA_API int llama_model_quantize(
        const char * fname_inp,
        const char * fname_out,
        int itype,
        int qk);

// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
        struct llama_context * ctx,
        const llama_token * tokens,
        int n_tokens,
        int n_past,
        int n_threads);

// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
        struct llama_context * ctx,
        const char * text,
        llama_token * tokens,
        int n_max_tokens,
        bool add_bos);

LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx  (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);

// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);

// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);

// Special tokens
LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos();

// TODO: improve the last_n_tokens interface ?
LLAMA_API llama_token llama_sample_top_p_top_k(
        struct llama_context * ctx,
        const llama_token * last_n_tokens_data,
        int last_n_tokens_size,
        int top_k,
        double top_p,
        double temp,
        double repeat_penalty);

// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);

// Print system information
LLAMA_API const char * llama_print_system_info(void);

#ifdef __cplusplus
}
#endif

#endif
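The header leaves its usage TODO open. The following is a minimal sketch of the intended call sequence, using only the declarations above; the model path, thread count, generation length, and sampling values are illustrative placeholders, not values from this repository:

```
// minimal sketch: load, tokenize, eval, sample - assumes the "llama.h" above
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    llama_context_params lparams = llama_context_default_params();

    struct llama_context * ctx = llama_init_from_file("models/ggml-llama-7B.bin", lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // tokenize the prompt (+1 slot for the BOS token)
    const std::string prompt = " Hello";
    std::vector<llama_token> tokens(prompt.size() + 1);
    const int n = llama_tokenize(ctx, prompt.c_str(), tokens.data(), tokens.size(), true);
    if (n < 0) {
        fprintf(stderr, "tokenization failed\n");
        return 1;
    }
    tokens.resize(n);

    int n_past = 0;
    for (int i = 0; i < 16; ++i) {
        // feed the pending batch, then sample the next token
        if (llama_eval(ctx, tokens.data(), tokens.size(), n_past, 4) != 0) {
            fprintf(stderr, "eval failed\n");
            return 1;
        }
        n_past += tokens.size();

        const llama_token id = llama_sample_top_p_top_k(ctx, tokens.data(), tokens.size(), 40, 0.95, 0.80, 1.10);
        if (id == llama_token_eos()) {
            break;
        }

        printf("%s", llama_token_to_str(ctx, id));
        tokens = { id };
    }

    llama_print_timings(ctx);
    llama_free(ctx);

    return 0;
}
```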
@ -1,20 +0,0 @@
#!/bin/bash

# Usage:
#  speak.sh <voice_id> <text-to-speak>

# espeak
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"

# for Mac
say "$2"

# Eleven Labs
#
#wd=$(dirname $0)
#script=$wd/eleven-labs.py
#python3 $script $1 "$2" >/dev/null 2>&1
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1
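For context, the example program below invokes this script through `system()` as `speak.sh <voice_id> "<text>"`, so a replacement TTS command only needs to accept the same two positional arguments.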
@ -1,511 +0,0 @@
// Talk with AI
//

#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "llama.h"

#include <cassert>
#include <cstdio>
#include <fstream>
#include <regex>
#include <string>
#include <thread>
#include <vector>

std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
    // initialize to prompt number of chars, since n_tokens <= n_prompt_chars
    std::vector<llama_token> res(text.size() + (int) add_bos);
    int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
    assert(n >= 0);
    res.resize(n);

    return res;
}

// command-line parameters
struct whisper_params {
    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t voice_ms   = 10000;
    int32_t capture_id = -1;
    int32_t max_tokens = 32;
    int32_t audio_ctx  = 0;

    float vad_thold  = 0.6f;
    float freq_thold = 100.0f;

    bool speed_up      = false;
    bool translate     = false;
    bool print_special = false;
    bool print_energy  = false;
    bool no_timestamps = true;

    std::string person      = "Santa";
    std::string language    = "en";
    std::string model_wsp   = "models/ggml-base.en.bin";
    std::string model_llama = "models/ggml-llama-7B.bin";
    std::string speak       = "./examples/talk/speak.sh";
    std::string fname_out;
};

void whisper_print_usage(int argc, char ** argv, const whisper_params & params);

bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
    for (int i = 1; i < argc; i++) {
        std::string arg = argv[i];

        if (arg == "-h" || arg == "--help") {
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
        else if (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
        else if (arg == "-vms" || arg == "--voice-ms")      { params.voice_ms      = std::stoi(argv[++i]); }
        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
        else if (arg == "-p"   || arg == "--person")        { params.person        = argv[++i]; }
        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
        else if (arg == "-mw"  || arg == "--model-whisper") { params.model_wsp     = argv[++i]; }
        else if (arg == "-ml"  || arg == "--model-llama")   { params.model_llama   = argv[++i]; }
        else if (arg == "-s"   || arg == "--speak")         { params.speak         = argv[++i]; }
        else if (arg == "-f"   || arg == "--file")          { params.fname_out     = argv[++i]; }
        else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            whisper_print_usage(argc, argv, params);
            exit(0);
        }
    }

    return true;
}

void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
    fprintf(stderr, "\n");
    fprintf(stderr, "usage: %s [options]\n", argv[0]);
    fprintf(stderr, "\n");
    fprintf(stderr, "options:\n");
    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
    fprintf(stderr, "  -vms N,   --voice-ms N    [%-7d] voice duration in milliseconds\n", params.voice_ms);
    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n", params.capture_id);
    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n", params.audio_ctx);
    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n", params.vad_thold);
    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
    fprintf(stderr, "  -p NAME,  --person NAME   [%-7s] person name (for prompt selection)\n", params.person.c_str());
    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n", params.language.c_str());
    fprintf(stderr, "  -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
    fprintf(stderr, "  -ml FILE, --model-llama   [%-7s] llama model file\n", params.model_llama.c_str());
    fprintf(stderr, "  -s FILE,  --speak FILE    [%-7s] command for TTS\n", params.speak.c_str());
    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] text output file name\n", params.fname_out.c_str());
    fprintf(stderr, "\n");
}

std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
    const auto t_start = std::chrono::high_resolution_clock::now();

    prob = 0.0f;
    t_ms = 0;

    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    wparams.print_progress   = false;
    wparams.print_special    = params.print_special;
    wparams.print_realtime   = false;
    wparams.print_timestamps = !params.no_timestamps;
    wparams.translate        = params.translate;
    wparams.no_context       = true;
    wparams.single_segment   = true;
    wparams.max_tokens       = params.max_tokens;
    wparams.language         = params.language.c_str();
    wparams.n_threads        = params.n_threads;

    wparams.audio_ctx        = params.audio_ctx;
    wparams.speed_up         = params.speed_up;

    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
    }

    int prob_n = 0;
    std::string result;

    const int n_segments = whisper_full_n_segments(ctx);
    for (int i = 0; i < n_segments; ++i) {
        const char * text = whisper_full_get_segment_text(ctx, i);

        result += text;

        const int n_tokens = whisper_full_n_tokens(ctx, i);
        for (int j = 0; j < n_tokens; ++j) {
            const auto token = whisper_full_get_token_data(ctx, i, j);

            prob += token.p;
            ++prob_n;
        }
    }

    if (prob_n > 0) {
        prob /= prob_n;
    }

    const auto t_end = std::chrono::high_resolution_clock::now();
    t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();

    return result;
}

// need to have leading ' '
//const std::string k_prompt = R"( Transcript of a dialog, where {1} interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer {1}'s requests immediately and with precision.
//
//{0}: Hello, Bob.
//{1}: Hello {0}. How may I help you today?
//{0}:)";

const std::string k_prompt = R"( Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}’s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.

{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4})";

int main(int argc, char ** argv) {
    whisper_params params;

    if (whisper_params_parse(argc, argv, params) == false) {
        return 1;
    }

    if (whisper_lang_id(params.language.c_str()) == -1) {
        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
        whisper_print_usage(argc, argv, params);
        exit(0);
    }

    // whisper init

    struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());

    // llama init

    auto lparams = llama_context_default_params();

    lparams.n_ctx   = 512;
    lparams.n_parts = 2; // TODO fix
    lparams.seed    = 1; // TODO fix
    lparams.f16_kv  = true;

    struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);

    // print some info about the processing
    {
        fprintf(stderr, "\n");
        if (!whisper_is_multilingual(ctx_wsp)) {
            if (params.language != "en" || params.translate) {
                params.language = "en";
                params.translate = false;
                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
            }
        }
        fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
                __func__,
                params.n_threads,
                params.language.c_str(),
                params.translate ? "translate" : "transcribe",
                params.no_timestamps ? 0 : 1);

        fprintf(stderr, "\n");
    }

    // init audio

    audio_async audio(30*1000);
    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
        return 1;
    }

    audio.resume();

    int n_iter = 0;

    bool is_running  = true;
    bool force_speak = false;

    float prob0 = 0.0f;

    const std::string chat_symb = ":";
    const std::string bot_name  = "LLAMA";

    std::vector<float> pcmf32_cur;
    std::vector<float> pcmf32_prompt;

    std::string prompt_org = k_prompt;
    prompt_org = ::replace(prompt_org, "{0}", params.person);
    prompt_org = ::replace(prompt_org, "{1}", bot_name);

    {
        // get time string
        std::string time_str;
        {
            time_t t = time(0);
            struct tm * now = localtime(&t);
            char buf[128];
            strftime(buf, sizeof(buf), "%H:%M", now);
            time_str = buf;
        }
        prompt_org = ::replace(prompt_org, "{2}", time_str);
    }

    {
        // get year string
        std::string year_str;
        {
            time_t t = time(0);
            struct tm * now = localtime(&t);
            char buf[128];
            strftime(buf, sizeof(buf), "%Y", now);
            year_str = buf;
        }
        prompt_org = ::replace(prompt_org, "{3}", year_str);
    }

    prompt_org = ::replace(prompt_org, "{4}", chat_symb);

    auto embd_inp = ::llama_tokenize(ctx_llama, prompt_org, true);

    const int n_ctx = llama_n_ctx(ctx_llama);

    printf("\n");
    printf("%s : initializing - please wait ...\n", __func__);

    if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return 1;
    }

    //fprintf(stdout, "\n");
    //fprintf(stdout, "%s", prompt_org.c_str());
    //fflush(stdout);

    printf("%s : done! start speaking in the microphone\n", __func__);
    printf("\n");
    printf("%s%s", params.person.c_str(), chat_symb.c_str());
    fflush(stdout);

    audio.clear();

    const int n_keep   = embd_inp.size();
    const int voice_id = 2;

    int n_past = n_keep;
    int n_prev = 64; // TODO arg

    std::vector<llama_token> embd;

    std::vector<std::string> antiprompts = {
        params.person + chat_symb,
    };

    // main loop
    while (is_running) {
        // handle Ctrl + C
        is_running = sdl_poll_events();

        if (!is_running) {
            break;
        }

        // delay
        std::this_thread::sleep_for(std::chrono::milliseconds(100));

        int64_t t_ms = 0;

        {
            audio.get(2000, pcmf32_cur);

            if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
                //fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);

                audio.get(params.voice_ms, pcmf32_cur);

                std::string text_heard;

                if (!force_speak) {
                    text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
                }

                // remove text between square brackets using regex
                {
                    std::regex re("\\[.*?\\]");
                    text_heard = std::regex_replace(text_heard, re, "");
                }

                // remove text between round brackets using regex
                {
                    std::regex re("\\(.*?\\)");
                    text_heard = std::regex_replace(text_heard, re, "");
                }

                // remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
                text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");

                // take first line
                text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));

                // remove leading and trailing whitespace
                text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
                text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");

                const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);

                if (text_heard.empty() || tokens.empty() || force_speak) {
                    //fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
                    audio.clear();

                    continue;
                }

                force_speak = false;

                text_heard.insert(0, 1, ' ');
                text_heard += "\n" + bot_name + chat_symb;
                fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
                fflush(stdout);

                embd = ::llama_tokenize(ctx_llama, text_heard, false);

                // text inference
                bool done = false;
                std::string text_to_speak;
                while (true) {
                    // predict
                    if (embd.size() > 0) {
                        if (n_past + (int) embd.size() > n_ctx) {
                            n_past = n_keep;

                            // context overflow: re-insert the last n_prev tokens of the history at the start of embd
                            embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());

                            //printf("\n---\n");
                            //printf("resetting: '");
                            //for (int i = 0; i < (int) embd.size(); i++) {
                            //    printf("%s", llama_token_to_str(ctx_llama, embd[i]));
                            //}
                            //printf("'\n");
                            //printf("\n---\n");
                        }

                        if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
                            fprintf(stderr, "%s : failed to eval\n", __func__);
                            return 1;
                        }
                    }

                    //printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());

                    embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
                    n_past += embd.size();
                    embd.clear();

                    if (done) break;

                    {
                        // out of user input, sample next token
                        const float top_k          = 5;
                        const float top_p          = 0.80f;
                        const float temp           = 0.30f;
                        const float repeat_penalty = 1.1764f;

                        const int repeat_last_n = 256;

                        llama_token id = 0;

                        {
                            //auto logits = llama_get_logits(ctx_llama);
                            //logits[llama_token_eos()] = 0;

                            id = llama_sample_top_p_top_k(ctx_llama,
                                    embd_inp.data() + std::max(0, n_past - repeat_last_n),
                                    repeat_last_n, top_k, top_p, temp, repeat_penalty);
                        }

                        if (id != llama_token_eos()) {
                            // add it to the context
                            embd.push_back(id);

                            text_to_speak += llama_token_to_str(ctx_llama, id);

                            printf("%s", llama_token_to_str(ctx_llama, id));
                        } else {
                            // TODO
                            printf("EOS TOKEN - SHOULD NOT HAPPEN\n");
                            exit(0);
                        }
                    }

                    {
                        std::string last_output;
                        for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
                            last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
                        }
                        last_output += llama_token_to_str(ctx_llama, embd[0]);

                        for (std::string & antiprompt : antiprompts) {
                            if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
                                done = true;
                                text_to_speak = ::replace(text_to_speak, antiprompt, "");
                                fflush(stdout);
                                break;
                            }
                        }
                    }

                    is_running = sdl_poll_events();

                    if (!is_running) {
                        break;
                    }
                }

                text_to_speak = ::replace(text_to_speak, "\"", "");
                system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());

                audio.clear();

                ++n_iter;
            }
        }
    }

    audio.pause();

    whisper_print_timings(ctx_wsp);
    whisper_free(ctx_wsp);

    return 0;
}
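The trickiest step in the loop above is the context swap: when `n_past + embd.size()` would exceed `n_ctx`, the code rewinds to `n_keep` (the prompt length) and replays the last `n_prev` tokens of the accumulated history. A self-contained sketch of the same sliding-window idea, with integers standing in for tokens and deliberately small, illustrative sizes:

```
// standalone demo of the sliding-window trick used in the main loop above
#include <cstdio>
#include <vector>

int main() {
    const int n_ctx  = 16; // illustrative; the example uses 512
    const int n_keep = 4;  // tokens of the initial prompt to preserve
    const int n_prev = 6;  // tail of the history to replay after a swap

    std::vector<int> history; // stands in for embd_inp
    int n_past = 0;

    for (int tok = 0; tok < 40; ++tok) {
        std::vector<int> embd = { tok };

        if (n_past + (int) embd.size() > n_ctx) {
            // rewind to just after the prompt and replay the recent tail,
            // so generation continues seamlessly in the shortened context
            n_past = n_keep;
            embd.insert(embd.begin(), history.end() - n_prev, history.end());
            printf("swap: replaying %d tokens after the prompt\n", n_prev);
        }

        // (a real program would call llama_eval(ctx, embd.data(), ...) here)
        history.insert(history.end(), embd.begin(), embd.end());
        n_past += embd.size();
    }

    return 0;
}
```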
@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
 Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:

 ```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
+wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
 ```

 ## TTS
@ -9,4 +9,4 @@ To use:
 5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.

 [^1]: I recommend the tiny or base models for running on an Android device.

-<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
+<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">
@ -2,7 +2,6 @@ package com.whispercppdemo.ui.main

 import androidx.compose.foundation.layout.*
 import androidx.compose.foundation.rememberScrollState
-import androidx.compose.foundation.text.selection.SelectionContainer
 import androidx.compose.foundation.verticalScroll
 import androidx.compose.material3.*
 import androidx.compose.runtime.Composable
@ -20,7 +19,6 @@ fun MainScreen(viewModel: MainScreenViewModel) {
         canTranscribe = viewModel.canTranscribe,
         isRecording = viewModel.isRecording,
         messageLog = viewModel.dataLog,
-        onBenchmarkTapped = viewModel::benchmark,
         onTranscribeSampleTapped = viewModel::transcribeSample,
         onRecordTapped = viewModel::toggleRecord
     )
@ -32,7 +30,6 @@ private fun MainScreen(
     canTranscribe: Boolean,
     isRecording: Boolean,
     messageLog: String,
-    onBenchmarkTapped: () -> Unit,
     onTranscribeSampleTapped: () -> Unit,
     onRecordTapped: () -> Unit
 ) {
@ -48,11 +45,8 @@ private fun MainScreen(
             .padding(innerPadding)
             .padding(16.dp)
     ) {
-        Column(verticalArrangement = Arrangement.SpaceBetween) {
-            Row(horizontalArrangement = Arrangement.SpaceBetween, modifier = Modifier.fillMaxWidth()) {
-                BenchmarkButton(enabled = canTranscribe, onClick = onBenchmarkTapped)
-                TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
-            }
+        Row(horizontalArrangement = Arrangement.SpaceBetween) {
+            TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
             RecordButton(
                 enabled = canTranscribe,
                 isRecording = isRecording,
@ -66,16 +60,7 @@ private fun MainScreen(

 @Composable
 private fun MessageLog(log: String) {
-    SelectionContainer() {
-        Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
-    }
+    Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
 }
-
-@Composable
-private fun BenchmarkButton(enabled: Boolean, onClick: () -> Unit) {
-    Button(onClick = onClick, enabled = enabled) {
-        Text("Benchmark")
-    }
-}

 @Composable
@ -41,15 +41,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {

     init {
         viewModelScope.launch {
-            printSystemInfo()
             loadData()
         }
     }

-    private suspend fun printSystemInfo() {
-        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
-    }
-
     private suspend fun loadData() {
         printMessage("Loading data...\n")
         try {
@ -86,29 +81,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
         //whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
     }

-    fun benchmark() = viewModelScope.launch {
-        runBenchmark(6)
-    }
-
     fun transcribeSample() = viewModelScope.launch {
         transcribeAudio(getFirstSample())
     }

-    private suspend fun runBenchmark(nthreads: Int) {
-        if (!canTranscribe) {
-            return
-        }
-
-        canTranscribe = false
-
-        printMessage("Running benchmark. This will take minutes...\n")
-        whisperContext?.benchMemory(nthreads)?.let{ printMessage(it) }
-        printMessage("\n")
-        whisperContext?.benchGgmlMulMat(nthreads)?.let{ printMessage(it) }
-
-        canTranscribe = true
-    }
-
     private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
         samplesPath.listFiles()!!.first()
     }
@ -138,14 +114,11 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
         canTranscribe = false

         try {
-            printMessage("Reading wave samples... ")
+            printMessage("Reading wave samples...\n")
             val data = readAudioSamples(file)
-            printMessage("${data.size / (16000 / 1000)} ms\n")
             printMessage("Transcribing data...\n")
-            val start = System.currentTimeMillis()
             val text = whisperContext?.transcribeData(data)
-            val elapsed = System.currentTimeMillis() - start
-            printMessage("Done ($elapsed ms): $text\n")
+            printMessage("Done: $text\n")
         } catch (e: Exception) {
             Log.w(LOG_TAG, e)
             printMessage("${e.localizedMessage}\n")
@ -27,14 +27,6 @@ class WhisperContext private constructor(private var ptr: Long) {
         }
     }

-    suspend fun benchMemory(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchMemcpy(nthreads)
-    }
-
-    suspend fun benchGgmlMulMat(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchGgmlMulMat(nthreads)
-    }
-
     suspend fun release() = withContext(scope.coroutineContext) {
         if (ptr != 0L) {
             WhisperLib.freeContext(ptr)
@ -74,10 +66,6 @@ class WhisperContext private constructor(private var ptr: Long) {
             }
             return WhisperContext(ptr)
         }
-
-        fun getSystemInfo(): String {
-            return WhisperLib.getSystemInfo()
-        }
     }
 }

@ -129,9 +117,6 @@ private class WhisperLib {
         external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
         external fun getTextSegmentCount(contextPtr: Long): Int
         external fun getTextSegment(contextPtr: Long, index: Int): String
-        external fun getSystemInfo(): String
-        external fun benchMemcpy(nthread: Int): String
-        external fun benchGgmlMulMat(nthread: Int): String
     }
 }
@ -6,7 +6,6 @@
 #include <sys/sysinfo.h>
 #include <string.h>
 #include "whisper.h"
-#include "ggml.h"

 #define UNUSED(x) (void)(x)
 #define TAG "JNI"
@ -215,29 +214,3 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
     jstring string = (*env)->NewStringUTF(env, text);
     return string;
 }
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
-        JNIEnv *env, jobject thiz
-) {
-    UNUSED(thiz);
-    const char *sysinfo = whisper_print_system_info();
-    jstring string = (*env)->NewStringUTF(env, sysinfo);
-    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
-                                                                      jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
-    return string; // note: absent in the recorded diff; a jstring-returning function must return a value
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
-                                                                          jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
-    return string; // note: absent in the recorded diff, added for the same reason
-}
@ -24,5 +24,3 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
 This can significantly improve the performance of the transcription:

 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
-
-In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
@ -296,10 +296,6 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
 			};
@ -7,9 +7,8 @@ To use:
 2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
 3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
 4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
-5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
+5. Select the "release" build configuration under "Run", then deploy and run to your device.

 [^1]: I recommend the tiny, base or small models for running on an iOS device.
-[^2]: The `Release` build can boost performance of transcription. In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.

 ![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f7655c5ace9.png)
@ -430,10 +430,6 @@
 				LLVM_LTO = YES;
 				MACOSX_DEPLOYMENT_TARGET = 13.0;
 				MARKETING_VERSION = 1.0;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SDKROOT = auto;
27 ggml.h
@ -198,8 +198,6 @@ struct ggml_object;
 struct ggml_context;

 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,
@ -228,9 +226,7 @@ enum ggml_op {
     GGML_OP_STEP,
     GGML_OP_RELU,
     GGML_OP_GELU,
-    GGML_OP_SILU,
     GGML_OP_NORM, // normalize
-    GGML_OP_RMS_NORM,

     GGML_OP_MUL_MAT,

@ -330,10 +326,7 @@ void ggml_print_objects(const struct ggml_context * ctx);
 int    ggml_nelements(const struct ggml_tensor * tensor);
 size_t ggml_nbytes   (const struct ggml_tensor * tensor);

-int    ggml_blck_size (enum ggml_type type);
-size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+size_t ggml_type_size (enum ggml_type type);

 size_t ggml_element_size(const struct ggml_tensor * tensor);

 struct ggml_context * ggml_init(struct ggml_init_params params);
@ -343,9 +336,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum ggml_type type,
@ -476,20 +466,12 @@ struct ggml_tensor * ggml_gelu(
         struct ggml_context * ctx,
         struct ggml_tensor * a);

-struct ggml_tensor * ggml_silu(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a);
-
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
         struct ggml_tensor * a);

-struct ggml_tensor * ggml_rms_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor * a);
-
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
@ -744,13 +726,6 @@ enum ggml_opt_result ggml_opt(
         struct ggml_opt_params params,
         struct ggml_tensor * f);

-//
-// quantization
-//
-
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-
 //
 // system info
 //
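The removed comments spell out the relation between these helpers: quantized types store `ggml_blck_size(type)` elements per block of `ggml_type_size(type)` bytes, and `ggml_type_sizef()` is the resulting per-element size as a float. A hedged sketch of how a caller would combine the pair (the function name here is illustrative, not part of the API):

```
#include <stddef.h>
#include "ggml.h"

// bytes needed for n elements of a given type; for quantized types,
// n is assumed to be a multiple of the block size
size_t row_size_bytes(enum ggml_type type, int n) {
    return (size_t) (n / ggml_blck_size(type)) * ggml_type_size(type);
}
```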
@ -6,7 +6,7 @@ using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either
 the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
 script to download the already converted models. Currently, they are hosted on the following locations:

-- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 Sample usage:
@ -23,7 +23,7 @@ You can now use it like this:

 A third option to obtain the model files is to download them from Hugging Face:

-https://huggingface.co/ggerganov/whisper.cpp/tree/main
+https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main

 ## Available models
@ -79,11 +79,11 @@ dir_model = sys.argv[1]
 dir_whisper = sys.argv[2]
 dir_out = sys.argv[3]

-with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
+with open(dir_model + "/vocab.json", "r") as f:
     encoder = json.load(f)
-with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
+with open(dir_model + "/added_tokens.json", "r") as f:
     encoder_added = json.load(f)
-with open(dir_model + "/config.json", "r", encoding="utf8") as f:
+with open(dir_model + "/config.json", "r") as f:
     hparams = json.load(f)

 model = WhisperForConditionalGeneration.from_pretrained(dir_model)
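For context on this hunk: without an explicit `encoding` argument, Python's `open()` falls back to the locale's preferred encoding, so the `encoding="utf8"` arguments present on the newer branch make the vocabulary and config reads behave the same on every system.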
@ -1,82 +0,0 @@
#!/bin/bash

# This script downloads Whisper model files that have already been converted to Core ML format.
# This way you don't have to convert them yourself.

src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
pfx="resolve/main/ggml"

# get the path of this script
function get_script_path() {
    if [ -x "$(command -v realpath)" ]; then
        echo "$(dirname $(realpath $0))"
    else
        local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
        echo "$ret"
    fi
}

models_path="$(get_script_path)"

# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )

# list available models
function list_models {
    printf "\n"
    printf "  Available models:"
    for model in "${models[@]}"; do
        printf " $model"
    done
    printf "\n\n"
}

if [ "$#" -ne 1 ]; then
    printf "Usage: $0 <model>\n"
    list_models

    exit 1
fi

model=$1

if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
    printf "Invalid model: $model\n"
    list_models

    exit 1
fi

# download Core ML model

printf "Downloading Core ML model $model from '$src' ...\n"

cd $models_path

if [ -f "ggml-$model.mlmodel" ]; then
    printf "Model $model already exists. Skipping download.\n"
    exit 0
fi

if [ -x "$(command -v wget)" ]; then
    wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
elif [ -x "$(command -v curl)" ]; then
    curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
else
    printf "Either wget or curl is required to download models.\n"
    exit 1
fi

if [ $? -ne 0 ]; then
    printf "Failed to download Core ML model $model \n"
    printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
    exit 1
fi

printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
printf "Run the following command to compile it:\n\n"
printf "  $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
printf "You can now use it like this:\n\n"
printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
printf "\n"
@ -40,7 +40,7 @@ if exist "ggml-%model%.bin" (
   goto :eof
 )

-PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
+PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"

 if %ERRORLEVEL% neq 0 (
   echo Failed to download ggml model %model%
@ -6,7 +6,7 @@
 #src="https://ggml.ggerganov.com"
 #pfx="ggml-model-whisper"

-src="https://huggingface.co/ggerganov/whisper.cpp"
+src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
 pfx="resolve/main/ggml"

 # get the path of this script
207
whisper.cpp
207
whisper.cpp
@ -1,8 +1,5 @@
|
|||||||
#define WHISPER_BUILD
|
#define WHISPER_BUILD
|
||||||
#include "whisper.h"
|
#include "whisper.h"
|
||||||
#if WHISPER_USE_COREML
|
|
||||||
#include "coreml/whisper-encoder.h"
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "ggml.h"
|
#include "ggml.h"
|
||||||
|
|
||||||
@ -589,10 +586,6 @@ struct whisper_state {
|
|||||||
|
|
||||||
int lang_id = 0; // english by default
|
int lang_id = 0; // english by default
|
||||||
|
|
||||||
#ifdef WHISPER_USE_COREML
|
|
||||||
whisper_coreml_context * ctx_coreml;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// [EXPERIMENTAL] token-level timestamps data
|
// [EXPERIMENTAL] token-level timestamps data
|
||||||
int64_t t_beg = 0;
|
int64_t t_beg = 0;
|
||||||
int64_t t_last = 0;
|
int64_t t_last = 0;
|
||||||
@ -638,13 +631,12 @@ struct whisper_context {
|
|||||||
int64_t t_load_us = 0;
|
int64_t t_load_us = 0;
|
||||||
int64_t t_start_us = 0;
|
int64_t t_start_us = 0;
|
||||||
|
|
||||||
|
|
||||||
ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
|
ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
|
||||||
|
|
||||||
whisper_model model;
|
whisper_model model;
|
||||||
whisper_vocab vocab;
|
whisper_vocab vocab;
|
||||||
whisper_state * state = nullptr;
|
whisper_state * state = nullptr;
|
||||||
|
|
||||||
std::string path_model; // populated by whisper_init_from_file()
|
|
||||||
};
|
};
|
||||||
|
|
||||||
template<typename T>
|
template<typename T>
|
||||||
@ -1375,7 +1367,6 @@ static bool whisper_encode_internal(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef WHISPER_USE_COREML
|
|
||||||
struct ggml_tensor * cur;
|
struct ggml_tensor * cur;
|
||||||
|
|
||||||
// convolution + gelu
|
// convolution + gelu
|
||||||
@ -1607,7 +1598,7 @@ static bool whisper_encode_internal(
|
|||||||
ggml_repeat(ctx0, layer.mlp_ln_w, cur),
|
ggml_repeat(ctx0, layer.mlp_ln_w, cur),
|
||||||
cur),
|
cur),
|
||||||
ggml_repeat(ctx0, layer.mlp_ln_b, cur));
|
ggml_repeat(ctx0, layer.mlp_ln_b, cur));
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef WHISPER_USE_FLASH_FF
|
#ifdef WHISPER_USE_FLASH_FF
|
||||||
wstate.use_buf(ctx0, 0);
|
wstate.use_buf(ctx0, 0);
|
||||||
@ -1647,7 +1638,7 @@ static bool whisper_encode_internal(
|
|||||||
ggml_repeat(ctx0, layer.mlp_1_b, cur),
|
ggml_repeat(ctx0, layer.mlp_1_b, cur),
|
||||||
cur);
|
cur);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
wstate.use_buf(ctx0, 3);
|
wstate.use_buf(ctx0, 3);
|
||||||
|
|
||||||
@ -1684,13 +1675,6 @@ static bool whisper_encode_internal(
|
|||||||
|
|
||||||
//ggml_graph_print(&gf);
|
//ggml_graph_print(&gf);
|
||||||
}
|
}
|
||||||
#else
|
|
||||||
wstate.use_buf(ctx0, -1);
|
|
||||||
|
|
||||||
struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
|
|
||||||
|
|
||||||
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// cur
|
// cur
|
||||||
//{
|
//{
|
||||||
@ -1858,6 +1842,8 @@ static bool whisper_decode_internal(
|
|||||||
|
|
||||||
// self-attention
|
// self-attention
|
||||||
{
|
{
|
||||||
|
wstate.use_buf(ctx0, 1);
|
||||||
|
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
|
||||||
layer.attn_q_w,
|
layer.attn_q_w,
|
||||||
cur);
|
cur);
|
||||||
@ -1919,6 +1905,8 @@ static bool whisper_decode_internal(
|
|||||||
// K * Q
|
// K * Q
|
||||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 0);
|
||||||
|
|
||||||
//struct ggml_tensor * KQ_scaled =
|
//struct ggml_tensor * KQ_scaled =
|
||||||
// ggml_scale(ctx0,
|
// ggml_scale(ctx0,
|
||||||
// KQ,
|
// KQ,
|
||||||
@ -1927,16 +1915,20 @@ static bool whisper_decode_internal(
|
|||||||
|
|
||||||
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
|
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ, n_past);
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 1);
|
||||||
|
|
||||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 0);
|
||||||
|
|
||||||
struct ggml_tensor * V_trans =
|
struct ggml_tensor * V_trans =
|
||||||
ggml_cpy(ctx0,
|
ggml_permute(ctx0,
|
||||||
ggml_permute(ctx0,
|
ggml_reshape_3d(ctx0,
|
||||||
ggml_reshape_3d(ctx0,
|
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
|
||||||
ggml_view_1d(ctx0, kv_self.v, (n_past + N)*n_state, il*n_ctx*ggml_element_size(kv_self.v)*n_state),
|
n_state/n_head, n_head, n_past + N),
|
||||||
n_state/n_head, n_head, n_past + N),
|
1, 2, 0, 3);
|
||||||
1, 2, 0, 3),
|
|
||||||
ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_state/n_head, n_head));
|
wstate.use_buf(ctx0, 1);
|
||||||
|
|
||||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||||
|
|
||||||
@ -1973,6 +1965,8 @@ static bool whisper_decode_internal(
|
|||||||
|
|
||||||
cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
|
cur = ggml_norm(ctx0, inpCA); // note: we use inpCA here
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 1);
|
||||||
|
|
||||||
// cur = ln_0_w*cur + ln_0_b
|
// cur = ln_0_w*cur + ln_0_b
|
||||||
cur = ggml_add(ctx0,
|
cur = ggml_add(ctx0,
|
||||||
ggml_mul(ctx0,
|
ggml_mul(ctx0,
|
||||||
@ -1983,6 +1977,8 @@ static bool whisper_decode_internal(
|
|||||||
|
|
||||||
// cross-attention
|
// cross-attention
|
||||||
{
|
{
|
||||||
|
wstate.use_buf(ctx0, 0);
|
||||||
|
|
||||||
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
|
||||||
layer.cross_attn_q_w,
|
layer.cross_attn_q_w,
|
||||||
cur);
|
cur);
|
||||||
@ -2006,13 +2002,12 @@ static bool whisper_decode_internal(
|
|||||||
ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
|
ggml_view_1d(ctx0, wstate.kv_cross.v, M*n_state, il*M*ggml_element_size(wstate.kv_cross.v)*n_state),
|
||||||
n_state/n_head, n_head, M);
|
n_state/n_head, n_head, M);
|
||||||
|
|
||||||
struct ggml_tensor * V_trans =
|
struct ggml_tensor * V_trans = ggml_permute(ctx0, Vcross, 1, 2, 0, 3);
|
||||||
ggml_cpy(ctx0,
|
|
||||||
ggml_permute(ctx0, Vcross, 1, 2, 0, 3),
|
|
||||||
ggml_new_tensor_3d(ctx0, Vcross->type, M, n_state/n_head, n_head));
|
|
||||||
|
|
||||||
// ------
|
// ------
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 1);
|
||||||
|
|
||||||
struct ggml_tensor * Q =
|
struct ggml_tensor * Q =
|
||||||
ggml_permute(ctx0,
|
ggml_permute(ctx0,
|
||||||
ggml_cpy(ctx0,
|
ggml_cpy(ctx0,
|
||||||
@ -2022,6 +2017,8 @@ static bool whisper_decode_internal(
|
|||||||
|
|
||||||
struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
|
struct ggml_tensor * K = ggml_permute(ctx0, Kcross, 0, 2, 1, 3);
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 0);
|
||||||
|
|
||||||
// K * Q
|
// K * Q
|
||||||
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
|
||||||
@ -2034,10 +2031,16 @@ static bool whisper_decode_internal(
|
|||||||
// no masking for cross-attention
|
// no masking for cross-attention
|
||||||
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
//struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 1);
|
||||||
|
|
||||||
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ);
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 0);
|
||||||
|
|
||||||
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||||
|
|
||||||
|
wstate.use_buf(ctx0, 1);
|
||||||
|
|
||||||
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
|
||||||
// cur = KQV_merged.contiguous().view(n_state, N)
|
// cur = KQV_merged.contiguous().view(n_state, N)
|
@ -2475,25 +2478,12 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 // interface implementation
 //

-#ifdef WHISPER_USE_COREML
-// replace .bin with .mlmodelc
-static std::string whisper_get_coreml_path(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    path_bin += ".mlmodelc";
-
-    return path_bin;
-}
-#endif
-
 struct whisper_state * whisper_init_state(whisper_context * ctx) {
     whisper_state * state = new whisper_state;

     const size_t scale = ctx->model.hparams.f16 ? 1 : 2;

     if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
         fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
         return nullptr;
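The removed whisper_get_coreml_path helper swaps the file extension by truncating at the last '.'. One caveat worth noting: if the file name has no extension but a directory name contains a dot, rfind('.') matches the directory dot. A self-contained sketch of the same technique, with a generic helper name and hypothetical paths:

    #include <cassert>
    #include <string>

    // Same technique as the removed helper: drop everything after the last '.',
    // then append the new extension. Caveat: a dot in a directory name is also
    // matched when the file name itself has no extension.
    static std::string swap_extension(std::string path, const std::string & ext) {
        const auto pos = path.rfind('.');
        if (pos != std::string::npos) {
            path = path.substr(0, pos);
        }
        return path + ext;
    }

    int main() {
        assert(swap_extension("models/ggml-base.en.bin", ".mlmodelc") == "models/ggml-base.en.mlmodelc");
        assert(swap_extension("model", ".mlmodelc") == "model.mlmodelc");
        return 0;
    }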
@ -2514,6 +2504,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
         fprintf(stderr, "%s: kv cross size   = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
     }

+
     state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);

     state->logits_id.reserve(ctx->model.hparams.n_vocab);
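The scale factor in whisper_init_state doubles the KV-cache byte budget when the model weights are stored as F32 (hparams.f16 == 0), since each cached element is twice as wide as in the F16 case. A toy illustration of the sizing rule (the budget below is made up, not a value from the real MEM_REQ_KV_SELF table):

    #include <cstddef>
    #include <cstdio>

    int main() {
        const size_t budget_f16 = 6*1024*1024; // hypothetical per-model F16 budget, bytes
        const bool   f16        = false;       // model stored as F32
        const size_t scale      = f16 ? 1 : 2; // F32 elements are twice as wide

        std::printf("kv self-attention budget: %zu MB\n", (scale*budget_f16)/(1024*1024));
        return 0;
    }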
@ -2533,21 +2524,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {

     state->rng = std::mt19937(0);

-#ifdef WHISPER_USE_COREML
-    const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
-
-    fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-    fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
-
-    state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
-    if (!state->ctx_coreml) {
-        fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
-        return nullptr;
-    }
-
-    fprintf(stderr, "%s: Core ML model loaded\n", __func__);
-#endif
-
     return state;
 }
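The block removed above is the usual optional-accelerator pattern: the backend is compiled in behind a build flag, loaded once at state init, and a missing model aborts initialization. A minimal sketch of the same shape with a hypothetical backend_init/backend_free pair (stand-ins, not the real whisper_coreml_* implementation):

    #include <cstdio>

    struct backend_ctx { int unused; };

    // Hypothetical stand-ins for whisper_coreml_init / whisper_coreml_free:
    static backend_ctx * backend_init(const char * path) {
        std::fprintf(stderr, "loading accelerated model from '%s'\n", path);
        return new backend_ctx{0}; // a real backend returns nullptr on failure
    }

    static void backend_free(backend_ctx * bctx) {
        delete bctx;
    }

    int main() {
    #ifdef USE_ACCEL // compile-time opt-in, as WHISPER_USE_COREML above
        backend_ctx * bctx = backend_init("models/ggml-base.en.mlmodelc");
        if (!bctx) {
            std::fprintf(stderr, "failed to load accelerated model\n");
            return 1; // init aborts, mirroring the `return nullptr;` above
        }
        backend_free(bctx);
    #else
        (void) backend_init; (void) backend_free; // pattern compiled out by default
    #endif
        return 0;
    }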
@ -2563,7 +2539,6 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
     }

     loader.context = &fin;
-
     loader.read = [](void * ctx, void * output, size_t read_size) {
         std::ifstream * fin = (std::ifstream*)ctx;
         fin->read((char *)output, read_size);
@ -2580,13 +2555,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
         fin->close();
     };

-    auto ctx = whisper_init_no_state(&loader);
-
-    if (ctx) {
-        ctx->path_model = path_model;
-    }
-
-    return ctx;
+    return whisper_init_no_state(&loader);
 }

 struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
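whisper_init_from_file_no_state wraps the file in a callback-based loader; the talk.llama side additionally records path_model on the context, which the Core ML path above consumes. The same callback shape works for any byte source. A sketch with a stand-in loader struct mirroring the fields visible in these hunks (context/read/close; the real whisper_model_loader may carry more members, such as an eof callback):

    #include <algorithm>
    #include <cstdint>
    #include <cstring>

    // Stand-in mirroring the callback shape used by the loader above.
    struct model_loader {
        void * context;
        size_t (*read)(void * ctx, void * output, size_t read_size);
        void   (*close)(void * ctx);
    };

    struct buffer_reader {
        const uint8_t * data;
        size_t size;
        size_t pos;
    };

    int main() {
        static const uint8_t blob[4] = { 'g', 'g', 'm', 'l' };
        buffer_reader br = { blob, sizeof(blob), 0 };

        model_loader loader;
        loader.context = &br;
        loader.read = [](void * ctx, void * output, size_t read_size) {
            buffer_reader * r = (buffer_reader *) ctx;
            const size_t n = std::min(read_size, r->size - r->pos);
            std::memcpy(output, r->data + r->pos, n);
            r->pos += n;
            return n;
        };
        loader.close = [](void * ctx) { (void) ctx; }; // nothing to release

        uint8_t magic[4];
        loader.read(loader.context, magic, sizeof(magic));
        loader.close(loader.context);
        return 0;
    }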
@ -2711,10 +2680,6 @@ void whisper_free(struct whisper_context * ctx) {

         whisper_free_state(ctx->state);

-#ifdef WHISPER_USE_COREML
-        whisper_coreml_free(ctx->state->ctx_coreml);
-        ctx->state->ctx_coreml = nullptr;
-#endif
         delete ctx;
     }
 }
@ -2891,7 +2856,7 @@ int whisper_lang_auto_detect_with_state(
     }

     // run the encoder
-    if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
+    if (whisper_encode(ctx, seek, n_threads) != 0) {
         fprintf(stderr, "%s: failed to encode\n", __func__);
         return -6;
     }
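On the left of this compare the encoder call threads an explicit whisper_state, so callers can keep several independent states over one shared, immutable context. A usage sketch against the left side's API (the function names are exactly those declared in this diff; the model path and thread count are placeholders, and real code must compute the mel spectrogram into the state before encoding):

    #include "whisper.h"
    #include <cstdio>

    int main() {
        struct whisper_context * ctx = whisper_init_from_file_no_state("models/ggml-base.en.bin");
        if (!ctx) return 1;

        // a state owns the KV caches; several states may share one context
        struct whisper_state * state = whisper_init_state(ctx);
        if (!state) { whisper_free(ctx); return 1; }

        // seek = 0, 4 threads (mel data must already be in `state` in real code)
        if (whisper_encode_with_state(ctx, state, 0, 4) != 0) {
            std::fprintf(stderr, "encode failed\n");
        }

        whisper_free_state(state);
        whisper_free(ctx);
        return 0;
    }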
@ -2955,71 +2920,6 @@ int whisper_lang_auto_detect(
     return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
 }

-int whisper_model_n_vocab(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_vocab;
-}
-
-int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_ctx;
-}
-
-int whisper_model_n_audio_state(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_state;
-}
-
-int whisper_model_n_audio_head(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_head;
-}
-
-int whisper_model_n_audio_layer(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_audio_layer;
-}
-
-int whisper_model_n_text_ctx(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_ctx;
-}
-
-int whisper_model_n_text_state(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_state;
-}
-
-int whisper_model_n_text_head(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_head;
-}
-
-int whisper_model_n_text_layer(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_text_layer;
-}
-
-int whisper_model_n_mels(struct whisper_context * ctx) {
-    return ctx->model.hparams.n_mels;
-}
-
-int whisper_model_f16(struct whisper_context * ctx) {
-    return ctx->model.hparams.f16;
-}
-
-int whisper_model_type(struct whisper_context * ctx) {
-    return ctx->model.type;
-}
-
-const char *whisper_model_type_readable(struct whisper_context * ctx) {
-    switch (ctx->model.type) {
-    case e_model::MODEL_TINY:
-        return "tiny";
-    case e_model::MODEL_BASE:
-        return "base";
-    case e_model::MODEL_SMALL:
-        return "small";
-    case e_model::MODEL_MEDIUM:
-        return "medium";
-    case e_model::MODEL_LARGE:
-        return "large";
-    default:
-        return "unknown";
-    }
-}
-
 int whisper_n_len_from_state(struct whisper_state * state) {
     return state->mel.n_len;
 }
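The twelve whisper_model_* accessors and whisper_model_type_readable exist only on the talk.llama side of this compare. A sketch of the intended usage for model introspection, compiled against that side's whisper.h (model path is a placeholder):

    #include "whisper.h"
    #include <cstdio>

    // Valid only where the whisper_model_* accessors are declared.
    static void print_model_info(struct whisper_context * ctx) {
        std::fprintf(stderr, "model type : %s\n", whisper_model_type_readable(ctx));
        std::fprintf(stderr, "f16        : %d\n", whisper_model_f16(ctx));
        std::fprintf(stderr, "n_vocab    : %d\n", whisper_model_n_vocab(ctx));
        std::fprintf(stderr, "audio ctx  : %d\n", whisper_model_n_audio_ctx(ctx));
        std::fprintf(stderr, "text layers: %d\n", whisper_model_n_text_layer(ctx));
        std::fprintf(stderr, "n_mels     : %d\n", whisper_model_n_mels(ctx));
    }

    int main() {
        struct whisper_context * ctx = whisper_init_from_file_no_state("models/ggml-base.en.bin");
        if (!ctx) return 1;
        print_model_info(ctx);
        whisper_free(ctx);
        return 0;
    }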
@ -3182,7 +3082,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.max_initial_ts   =*/  1.0f,
         /*.length_penalty   =*/ -1.0f,

-        /*.temperature_inc  =*/  0.0f, // TODO: temporary disabled until improve performance
+        /*.temperature_inc  =*/  0.2f,
         /*.entropy_thold    =*/  2.4f,
         /*.logprob_thold    =*/ -1.0f,
         /*.no_speech_thold  =*/  0.6f,
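temperature_inc drives fallback decoding: when a decode attempt fails the entropy_thold / logprob_thold quality checks, it is retried at a higher temperature. The talk.llama side pins it to 0.0f (fallback disabled, per the TODO), while musl steps by 0.2. A sketch of the schedule these defaults imply (the 1.0 cap is a hypothetical choice for illustration):

    #include <cstdio>
    #include <vector>

    int main() {
        const float temperature     = 0.0f;
        const float temperature_inc = 0.2f; // 0.0f disables the fallback entirely
        const float temperature_max = 1.0f; // hypothetical cap

        std::vector<float> schedule;
        if (temperature_inc > 0.0f) {
            for (float t = temperature; t < temperature_max + 1e-6f; t += temperature_inc) {
                schedule.push_back(t);
            }
        } else {
            schedule.push_back(temperature);
        }

        // decoding stops at the first attempt that passes the quality checks
        for (float t : schedule) {
            std::printf("decode attempt at temperature %.1f\n", t);
        }
        return 0;
    }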
@ -4651,15 +4551,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
 //

 WHISPER_API int whisper_bench_memcpy(int n_threads) {
-    fputs(whisper_bench_memcpy_str(n_threads), stderr);
-    return 0;
-}
-
-WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
-    static std::string s;
-    s = "";
-    char strbuf[256];
-
     ggml_time_init();

     size_t n = 50;
@ -4689,8 +4580,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
         src[0] = rand();
     }

-    snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));
-    s += strbuf;
+    fprintf(stderr, "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu));

     // needed to prevent the compile from optimizing the memcpy away
     {
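The bandwidth line above is simply bytes moved divided by seconds and by 2^30. A worked instance with made-up measurements:

    #include <cstdio>

    int main() {
        const double n    = 50;                 // number of copies
        const double size = 20.0*1024.0*1024.0; // bytes per copy (hypothetical)
        const double tsum = 0.05;               // measured seconds (hypothetical)

        // 50 copies * 20 MiB / 0.05 s = 1000 MiB / 0.05 s ~= 19.53 GB/s
        std::printf("memcpy: %.2f GB/s\n", (n*size)/(tsum*1024.0*1024.0*1024.0));
        return 0;
    }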
@ -4698,26 +4588,16 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {

         for (size_t i = 0; i < size; i++) sum += dst[i];

-        snprintf(strbuf, sizeof(strbuf), "sum: %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
-        s += strbuf;
+        fprintf(stderr, "sum: %s %f\n", sum == -536870910.00 ? "ok" : "error", sum);
     }

     free(src);
     free(dst);

-    return s.c_str();
-}
-
-WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
-    fputs(whisper_bench_ggml_mul_mat_str(n_threads), stderr);
     return 0;
 }

-WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
-    static std::string s;
-    s = "";
-    char strbuf[256];
-
+WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads) {
     ggml_time_init();

     const int n_max = 128;
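The *_str variants kept on the talk.llama side build the report into a function-local static std::string and return s.c_str(), while the thin int wrappers fputs() the result to stderr. The returned pointer is valid only until the next call and is not thread-safe. A compact sketch of the pattern (generic names, not the real bench functions):

    #include <cstdio>
    #include <string>

    // Same shape as whisper_bench_memcpy_str above: accumulate into a static
    // buffer, hand back a C string that stays valid until the next call.
    static const char * bench_report_str(int n_runs) {
        static std::string s;
        s = "";
        char strbuf[64];
        for (int i = 0; i < n_runs; ++i) {
            std::snprintf(strbuf, sizeof(strbuf), "run %d: ok\n", i);
            s += strbuf;
        }
        return s.c_str();
    }

    static int bench_report(int n_runs) { // the thin wrapper shape
        std::fputs(bench_report_str(n_runs), stderr);
        return 0;
    }

    int main() {
        return bench_report(3);
    }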
@ -4793,12 +4673,11 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
             s = ((2.0*N*N*N*n)/tsum)*1e-9;
         }

-        snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+        fprintf(stderr, "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
                 N, N, s_fp16, n_fp16, s_fp32, n_fp32);
-        s += strbuf;
     }

-    return s.c_str();
+    return 0;
 }

 // =================================================================================================
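The GFLOPS line uses the usual 2*N^3 cost model for an N x N matrix product (N multiplies and about N additions per output element), times n runs, divided by total seconds and 1e9. A worked instance with hypothetical numbers:

    #include <cstdio>

    int main() {
        const double N    = 512;  // matrix side
        const double n    = 10;   // runs (hypothetical)
        const double tsum = 0.25; // total seconds (hypothetical)

        // 2*512^3 * 10 / 0.25 * 1e-9 ~= 10.7
        std::printf("%.1f GFLOPS\n", ((2.0*N*N*N*n)/tsum)*1e-9);
        return 0;
    }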
17 whisper.h
@ -248,19 +248,6 @@ extern "C" {
     WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
     WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);

-    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_f16          (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
-
     // Token logits obtained from the last call to whisper_decode()
     // The logits for the last token are stored in the last row
     // Rows: n_tokens
@ -270,8 +257,6 @@ extern "C" {

     // Token Id -> String. Uses the vocabulary in the provided context
     WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
-    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
-

     // Special tokens
     WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
@ -477,9 +462,7 @@ extern "C" {
     // Temporary helpers needed for exposing ggml interface

     WHISPER_API int whisper_bench_memcpy(int n_threads);
-    WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
     WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
-    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);

 #ifdef __cplusplus
 }