Compare commits


4 Commits

16 changed files with 592 additions and 697 deletions
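Orientation, judging from the hunks below: the base (old) side is the v1.2.1 tree with Core ML support, and the head (new) side is an experimental speaker-diarization branch cut from v1.2.0 — so the diff reads as Core ML support being "removed" while an experimental segment-clustering API (`whisper_full_cluster_segments`, `ggml_svd_reduce_dims`) is added.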

.gitignore

@@ -1,7 +1,5 @@
 *.o
 *.a
-*.mlmodel
-*.mlmodelc
 .cache/
 .vs/
 .vscode/
@@ -12,7 +10,6 @@ build-em/
 build-debug/
 build-release/
 build-static/
-build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt

@@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.0)

-project(whisper.cpp VERSION 1.2.1)
+project(whisper.cpp VERSION 1.2.0)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -54,8 +54,6 @@ if (APPLE)
     option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
     option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
     option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
-
-    option(WHISPER_COREML              "whisper: enable Core ML framework" OFF)
 else()
     option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@@ -88,33 +86,16 @@ endif()

 find_package(Threads REQUIRED)

-# on APPLE
-if (APPLE)
-    # include Accelerate framework
-    if (NOT WHISPER_NO_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate)
-        if (ACCELERATE_FRAMEWORK)
-            message(STATUS "Accelerate framework found")
-
-            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-        else()
-            message(WARNING "Accelerate framework not found")
-        endif()
-    endif()
-
-    if (WHISPER_COREML)
-        find_library(FOUNDATION_FRAMEWORK Foundation)
-        find_library(COREML_FRAMEWORK CoreML)
-
-        if (COREML_FRAMEWORK)
-            message(STATUS "CoreML framework found")
-
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
-        else()
-            message(WARNING "CoreML framework not found")
-        endif()
-    endif()
+# on APPLE - include Accelerate framework
+if (APPLE AND NOT WHISPER_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
 endif()
@@ -200,33 +181,6 @@ if (WHISPER_PERF)
     set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()

-#
-# whisper.coreml - Core ML support
-#
-
-if (WHISPER_COREML)
-    set(TARGET whisper.coreml)
-
-    add_library(${TARGET}
-        coreml/whisper-encoder.h
-        coreml/whisper-encoder.mm
-        coreml/whisper-encoder-impl.h
-        coreml/whisper-encoder-impl.m
-        )
-
-    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PUBLIC
-        .
-        )
-
-    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
-
-    set_target_properties(${TARGET} PROPERTIES
-        COMPILE_FLAGS "-fobjc-arc"
-        )
-endif()
-
 #
 # whisper - this is the main library of the project
 #
@@ -246,10 +200,6 @@ target_include_directories(${TARGET} PUBLIC
     .
     )

-if (WHISPER_COREML)
-    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
-endif()
-
 if (MSVC)
     target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

Makefile

@@ -30,8 +30,8 @@ endif
 # Compile flags
 #

-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =

 # OS specific
@@ -132,10 +132,6 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
-ifdef WHISPER_COREML
-	CXXFLAGS += -DWHISPER_USE_COREML
-	LDFLAGS += -framework Foundation -framework CoreML
-endif
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
@@ -145,8 +141,6 @@ ifdef WHISPER_GPROF
 	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS   += -mcpu=native
-	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, 2, 3
@@ -188,23 +182,11 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o

-ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
-else
-whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
-	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
-
-whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
-	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
-
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
-endif
-
-libwhisper.a: ggml.o $(WHISPER_OBJ)
-	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
+libwhisper.a: ggml.o whisper.o
+	$(AR) rcs libwhisper.a ggml.o whisper.o

-libwhisper.so: ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
+libwhisper.so: ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
 	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
@@ -218,21 +200,21 @@ CC_SDL=`sdl2-config --cflags --libs`
 SRC_COMMON     = examples/common.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp

-main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h

-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)

-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)

-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)

-bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

 #
 # Audio samples
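On the base side, Core ML was opt-in at build time (`WHISPER_COREML=1 make`, which triggers the `ifdef WHISPER_COREML` block above and compiles the Objective-C wrapper objects); the head branch removes that path entirely and links plain `whisper.o` everywhere.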

README.md

@@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@@ -469,9 +469,7 @@ in [models](models).
 - [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
   - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
   - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
-- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
-  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
-  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
+- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)

 ## Examples

package.json

@@ -1,6 +1,6 @@
 {
   "name": "whisper.cpp",
-  "version": "1.2.1",
+  "version": "1.2.0",
   "description": "Whisper speech recognition",
   "main": "whisper.js",
   "scripts": {

coreml/whisper-encoder-impl.h

@@ -1,142 +0,0 @@ (file removed on the head branch)
//
// CoremlEncoder.h
//
// This file was automatically generated and should not be edited.
//
#import <Foundation/Foundation.h>
#import <CoreML/CoreML.h>
#include <stdint.h>
#include <os/log.h>
NS_ASSUME_NONNULL_BEGIN
/// Model Prediction Input Type
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
@end
/// Model Prediction Output Type
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
/// output as multidimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * output;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
@end
/// Class for model loading and prediction
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoder : NSObject
@property (readonly, nonatomic, nullable) MLModel * model;
/**
URL of the underlying .mlmodelc directory.
*/
+ (nullable NSURL *)URLOfModelInThisBundle;
/**
Initialize CoremlEncoder instance from an existing MLModel object.
Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
*/
- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
/**
Initialize CoremlEncoder instance with the model in this bundle.
*/
- (nullable instancetype)init;
/**
Initialize CoremlEncoder instance with the model in this bundle.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Construct CoremlEncoder instance asynchronously with configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
/**
Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param modelURL The model URL.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
/**
Make a prediction using the standard interface
@param input an instance of CoremlEncoderInput to predict from
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Make a prediction using the standard interface
@param input an instance of CoremlEncoderInput to predict from
@param options prediction options
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Make a prediction using the convenience interface
@param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Batch prediction
@param inputArray array of CoremlEncoderInput instances to obtain predictions from
@param options prediction options
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the predictions as NSArray<CoremlEncoderOutput *>
*/
- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
@end
NS_ASSUME_NONNULL_END

coreml/whisper-encoder-impl.m

@@ -1,197 +0,0 @@ (file removed on the head branch)
//
// CoremlEncoder.m
//
// This file was automatically generated and should not be edited.
//
#if !__has_feature(objc_arc)
#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
#endif
#import "whisper-encoder-impl.h"
@implementation CoremlEncoderInput
- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
self = [super init];
if (self) {
_melSegment = melSegment;
}
return self;
}
- (NSSet<NSString *> *)featureNames {
return [NSSet setWithArray:@[@"melSegment"]];
}
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
if ([featureName isEqualToString:@"melSegment"]) {
return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
}
return nil;
}
@end
@implementation CoremlEncoderOutput
- (instancetype)initWithOutput:(MLMultiArray *)output {
self = [super init];
if (self) {
_output = output;
}
return self;
}
- (NSSet<NSString *> *)featureNames {
return [NSSet setWithArray:@[@"output"]];
}
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
if ([featureName isEqualToString:@"output"]) {
return [MLFeatureValue featureValueWithMultiArray:self.output];
}
return nil;
}
@end
@implementation CoremlEncoder
/**
URL of the underlying .mlmodelc directory.
*/
+ (nullable NSURL *)URLOfModelInThisBundle {
NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
return [NSURL fileURLWithPath:assetPath];
}
/**
Initialize CoremlEncoder instance from an existing MLModel object.
Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
*/
- (instancetype)initWithMLModel:(MLModel *)model {
self = [super init];
if (!self) { return nil; }
_model = model;
if (_model == nil) { return nil; }
return self;
}
/**
Initialize CoremlEncoder instance with the model in this bundle.
*/
- (nullable instancetype)init {
return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
}
/**
Initialize CoremlEncoder instance with the model in this bundle.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
}
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
if (model == nil) { return nil; }
return [self initWithMLModel:model];
}
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
if (model == nil) { return nil; }
return [self initWithMLModel:model];
}
/**
Construct CoremlEncoder instance asynchronously with configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
[self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
configuration:configuration
completionHandler:handler];
}
/**
Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param modelURL The model URL.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
[MLModel loadContentsOfURL:modelURL
configuration:configuration
completionHandler:^(MLModel *model, NSError *error) {
if (model != nil) {
CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
handler(typedModel, nil);
} else {
handler(nil, error);
}
}];
}
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
}
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
if (!outFeatures) { return nil; }
return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
}
- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
return [self predictionFromFeatures:input_ error:error];
}
- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
if (!outBatch) { return nil; }
NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
for (NSInteger i = 0; i < outBatch.count; i++) {
id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
[results addObject:result];
}
return results;
}
@end

coreml/whisper-encoder.h

@@ -1,22 +0,0 @@ (file removed on the head branch)
// Wrapper of the Core ML Whisper Encoder model
//
// Code is derived from the work of Github user @wangchou
// ref: https://github.com/wangchou/callCoreMLFromCpp
#if __cplusplus
extern "C" {
#endif
struct whisper_coreml_context;
struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
void whisper_coreml_free(struct whisper_coreml_context * ctx);
void whisper_coreml_encode(
const whisper_coreml_context * ctx,
float * mel,
float * out);
#if __cplusplus
}
#endif
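
For context, this removed wrapper exposed the Core ML encoder through just three C calls. Below is a minimal sketch of driving it directly; the model path and the base model's output size (n_state = 512, n_ctx = 1500) are assumptions for illustration, while the 1 × 80 × 3000 mel shape comes from the generated header above. The base branch calls it the same way from `whisper_encode()`, as the whisper.cpp hunks further down show.

```cpp
// Hedged sketch: drive the (removed) Core ML encoder wrapper on the base branch.
// Assumes a compiled model at models/ggml-base.en.mlmodelc and base-model
// dimensions n_state = 512, n_ctx = 1500.
#include "coreml/whisper-encoder.h"

#include <vector>

int main() {
    struct whisper_coreml_context * cml = whisper_coreml_init("models/ggml-base.en.mlmodelc");
    if (cml == NULL) {
        return 1;
    }

    std::vector<float> mel(80*3000);   // one 30-second mel segment (1 x 80 x 3000)
    std::vector<float> out(512*1500);  // encoder output, n_state x n_ctx

    // runs the Core ML prediction and copies the result into `out`
    whisper_coreml_encode(cml, mel.data(), out.data());

    whisper_coreml_free(cml);
    return 0;
}
```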

coreml/whisper-encoder.mm

@@ -1,61 +0,0 @@ (file removed on the head branch)
#import "coreml/whisper-encoder.h"
#import "coreml/whisper-encoder-impl.h"
#import <CoreML/CoreML.h>
#include <stdlib.h>
#if __cplusplus
extern "C" {
#endif
struct whisper_coreml_context {
const void * data;
};
struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
if (data == NULL) {
return NULL;
}
whisper_coreml_context * ctx = new whisper_coreml_context;
ctx->data = data;
return ctx;
}
void whisper_coreml_free(struct whisper_coreml_context * ctx) {
CFRelease(ctx->data);
delete ctx;
}
void whisper_coreml_encode(
const whisper_coreml_context * ctx,
float * mel,
float * out) {
MLMultiArray * inMultiArray = [
[MLMultiArray alloc] initWithDataPointer: mel
shape: @[@1, @80, @3000]
dataType: MLMultiArrayDataTypeFloat32
strides: @[@(240000), @(3000), @1]
deallocator: nil
error: nil
];
CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];
MLMultiArray * outMA = outCoreML.output;
memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
}
#if __cplusplus
}
#endif

examples/main/main.cpp

@@ -352,14 +352,13 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
     fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

     const int n_segments = whisper_full_n_segments(ctx);
-    fout << "start,end,text\n";
     for (int i = 0; i < n_segments; ++i) {
         const char * text = whisper_full_get_segment_text(ctx, i);
         const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
         const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
         //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text << "\"\n";
+        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text << "\"\n";
     }

     return true;
@@ -619,6 +618,8 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: failed to process audio\n", argv[0]);
             return 10;
         }
+
+        whisper_full_cluster_segments(ctx);
     }

     // output stuff
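Two incidental behavioral changes in the CSV writer: the head side drops the `start,end,text` header row and puts a space after each separator, so downstream parsers that expect the base branch's header will need adjusting. The main hook for this branch is the second hunk: `whisper_full_cluster_segments(ctx)` runs right after a successful `whisper_full()` call.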

ggml.c

@@ -8517,6 +8517,195 @@ enum ggml_opt_result ggml_opt(

 ////////////////////////////////////////////////////////////////////////////////
void ggml_svd_reduce_dims(
int ne0,
int ne1,
float * a,
int nd) {
int n = ne1;
int m = ne0;
float * A = a;
float * A0 = (float *) malloc(n * m * sizeof(float));
// average vector
//float * M = (float *) malloc(m * sizeof(float));
//{
// for (int j = 0; j < m; ++j) {
// M[j] = 0.0f;
// }
// for (int i = 0; i < n; ++i) {
// for (int j = 0; j < m; ++j) {
// M[j] += A[i * m + j];
// }
// }
// for (int j = 0; j < m; ++j) {
// M[j] /= (float) n;
// }
//}
//// subtract average vector
//for (int i = 0; i < n; ++i) {
// for (int j = 0; j < m; ++j) {
// A[i * m + j] -= M[j];
// }
//}
//free(M);
memcpy(A0, A, n * m * sizeof(float));
// print A
//printf("A:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < m; ++j) {
// printf("%9.5f ", A[i * m + j]);
// }
// printf("\n");
//}
//printf("\n");
// SVD
// A = U * S * V^T
float * U = (float *) malloc(n * m * sizeof(float));
float * S = (float *) malloc(n * sizeof(float));
float * V = (float *) malloc(n * n * sizeof(float));
int lda = m;
int ldu = m;
int ldvt = n;
float work_size;
int lwork = -1;
int info = 0;
sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, &work_size, &lwork, &info);
lwork = (int) work_size;
//printf("work_size = %f, info = %d, lwork = %d\n", work_size, info, lwork);
float * work = (float *) malloc(lwork * sizeof(float));
sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, work, &lwork, &info);
free(work);
// print U
//printf("U:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < m; ++j) {
// printf("%9.5f ", U[i * m + j]);
// }
// printf("\n");
//}
//printf("\n");
// normalize S
{
double sum = 0.0;
for (int i = 0; i < n; ++i) {
sum += S[i];
}
sum *= sqrt((double) m);
for (int i = 0; i < n; ++i) {
S[i] /= sum;
}
}
// print S
printf("S:\n");
for (int i = 0; i < n; ++i) {
printf("- %d = %9.5f\n", i, S[i]);
}
printf("\n");
// print V
//printf("V:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < n; ++j) {
// printf("%9.5f ", V[i * n + j]);
// }
// printf("\n");
//}
//printf("\n");
// print A
//printf("A:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < m; ++j) {
// printf("%9.5f ", A[i * m + j]);
// }
// printf("\n");
//}
//printf("\n");
// compute singular vectors in U
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
U[i * m + j] *= S[i];
}
}
// normalize U
for (int i = 0; i < n; ++i) {
double sum = 0.0;
for (int j = 0; j < m; ++j) {
sum += U[i * m + j] * U[i * m + j];
}
sum = sqrt(sum);
for (int j = 0; j < m; ++j) {
U[i * m + j] /= sum*sqrt((double) m);
}
}
// print U
//printf("U:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < m; ++j) {
// printf("%9.5f ", U[i * m + j]);
// }
// printf("\n");
//}
//printf("\n");
// project A0 onto U
for (int i = 0; i < n; ++i) {
for (int j = 0; j < nd; ++j) {
A[i * nd + j] = 0.0f;
//if (j == 0) continue;
for (int k = 0; k < m; ++k) {
A[i * nd + j] += A0[i * m + k] * U[j * m + k];
}
}
}
// print A
//printf("A:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < n; ++j) {
// printf("%9.5f ", A[i * n + j]);
// }
// printf("\n");
//}
//printf("\n");
free(U);
free(S);
free(V);
free(A0);
}
////////////////////////////////////////////////////////////////////////////////
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
     return 1;
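
The new helper is easiest to read from its call sites in whisper.cpp below: it takes `ne1` row vectors of length `ne0`, runs LAPACK's `sgesvd_` on them, scales the singular vectors by the normalized singular values, and overwrites the front of the buffer with each row projected onto the top `nd` singular vectors (row `i` ends up at `a[i*nd]`). A minimal sketch under those assumptions follows; the test values are made up, and linking needs a LAPACK provider such as Apple's Accelerate framework, which the build files above already pull in on macOS.

```cpp
// Hedged sketch: in-place SVD dimensionality reduction with the new helper.
#include <cstdio>
#include <vector>

extern "C" void ggml_svd_reduce_dims(int ne0, int ne1, float * a, int nd);

int main() {
    const int ne0 = 8;   // original feature dimension
    const int ne1 = 4;   // number of vectors (rows)
    const int nd  = 2;   // reduced dimension

    std::vector<float> a(ne0*ne1);
    for (size_t i = 0; i < a.size(); ++i) {
        a[i] = (float) (i % 7) - 3.0f;   // arbitrary test data
    }

    ggml_svd_reduce_dims(ne0, ne1, a.data(), nd);

    // the first nd floats of each row now hold the reduced coordinates
    for (int i = 0; i < ne1; ++i) {
        printf("row %d: %8.4f %8.4f\n", i, a[i*nd + 0], a[i*nd + 1]);
    }
    return 0;
}
```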

ggml.h

@@ -726,6 +726,16 @@ enum ggml_opt_result ggml_opt(
         struct ggml_opt_params params,
         struct ggml_tensor * f);

+//
+// Temp stuff
+//
+
+void ggml_svd_reduce_dims(
+        int ne0,
+        int ne1,
+        float * a,
+        int nd);
+
 //
 // system info
 //

models/download-coreml-model.sh

@@ -1,82 +0,0 @@ (file removed on the head branch)
#!/bin/bash
# This script downloads Whisper model files that have already been converted to Core ML format.
# This way you don't have to convert them yourself.
src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
pfx="resolve/main/ggml"
# get the path of this script
function get_script_path() {
if [ -x "$(command -v realpath)" ]; then
echo "$(dirname $(realpath $0))"
else
local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
echo "$ret"
fi
}
models_path="$(get_script_path)"
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {
printf "\n"
printf " Available models:"
for model in "${models[@]}"; do
printf " $model"
done
printf "\n\n"
}
if [ "$#" -ne 1 ]; then
printf "Usage: $0 <model>\n"
list_models
exit 1
fi
model=$1
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
printf "Invalid model: $model\n"
list_models
exit 1
fi
# download Core ML model
printf "Downloading Core ML model $model from '$src' ...\n"
cd $models_path
if [ -f "ggml-$model.mlmodel" ]; then
printf "Model $model already exists. Skipping download.\n"
exit 0
fi
if [ -x "$(command -v wget)" ]; then
wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
elif [ -x "$(command -v curl)" ]; then
curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
else
printf "Either wget or curl is required to download models.\n"
exit 1
fi
if [ $? -ne 0 ]; then
printf "Failed to download Core ML model $model \n"
printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
exit 1
fi
printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
printf "Run the following command to compile it:\n\n"
printf " $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
printf "You can now use it like this:\n\n"
printf " $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
printf "\n"

whisper.cpp

@@ -1,8 +1,5 @@
 #define WHISPER_BUILD
 #include "whisper.h"
-#if WHISPER_USE_COREML
-#include "coreml/whisper-encoder.h"
-#endif

 #include "ggml.h"
@@ -271,6 +268,14 @@ static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
     { MODEL_LARGE,    71ull*MB },
 };

+static const std::map<e_model, size_t> MEM_REQ_KV_ENC_SELF = {
+    { MODEL_TINY,     23ull*MB },
+    { MODEL_BASE,     26ull*MB },
+    { MODEL_SMALL,   216ull*MB },
+    { MODEL_MEDIUM,  243ull*MB },
+    { MODEL_LARGE,   271ull*MB },
+};
+
 static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
     { MODEL_TINY,      9ull*MB },
     { MODEL_BASE,     18ull*MB },
@@ -574,6 +579,7 @@ struct whisper_context {
     // cross-attention KV cache for the decoders
     // shared between all decoders
     whisper_kv_cache kv_cross;
+    whisper_kv_cache kv_enc_self;

     whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
@@ -595,21 +601,18 @@ struct whisper_context {
     mutable std::mt19937 rng; // used for sampling at t > 0.0

-    int lang_id = 0; // english by default
-
-    std::string path_model; // populated by whisper_init_from_file()
-#ifdef WHISPER_USE_COREML
-    whisper_coreml_context * ctx_coreml;
-#endif
+    int lang_id;

     // [EXPERIMENTAL] token-level timestamps data
-    int64_t t_beg = 0;
-    int64_t t_last = 0;
+    int64_t t_beg;
+    int64_t t_last;
     whisper_token tid_last;
     std::vector<float> energy; // PCM signal energy

     // [EXPERIMENTAL] speed-up techniques
-    int32_t exp_n_audio_ctx = 0; // 0 - use default
+    int32_t exp_n_audio_ctx; // 0 - use default
+
+    std::vector<float> audio_embd;

     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(WHISPER_USE_SCRATCH)
@@ -844,6 +847,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         return false;
     }

+    if (!kv_cache_init(model.hparams, scale*MEM_REQ_KV_ENC_SELF.at(model.type), wctx.kv_enc_self, wctx.wtype, model.hparams.n_audio_ctx)) {
+        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
+        return false;
+    }
+
     {
         const size_t memory_size = ggml_nbytes(wctx.kv_cross.k) + ggml_nbytes(wctx.kv_cross.v);
         fprintf(stderr, "%s: kv cross size   = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
@@ -1366,7 +1374,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 static bool whisper_encode(
               whisper_context & wctx,
         const int   mel_offset,
-        const int   n_threads) {
+        const int   n_threads,
+              bool  repeat = false) {

     const int64_t t_start_us = ggml_time_us();

     const auto & model = wctx.model;
@@ -1398,14 +1407,31 @@ static bool whisper_encode(
         const int i0 = std::min(mel_offset, mel_inp.n_len);
         const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);

-        for (int j = 0; j < mel_inp.n_mel; ++j) {
-            for (int i = i0; i < i1; ++i) {
-                dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
+        if (repeat == false) {
+            for (int j = 0; j < mel_inp.n_mel; ++j) {
+                for (int i = i0; i < i1; ++i) {
+                    dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
+                }
+            }
+        } else {
+            for (int j = 0; j < mel_inp.n_mel; ++j) {
+                int k = 0;
+                while (k < 2*n_ctx) {
+                    for (int i = i0; i < i1; ++i) {
+                        dst[j*2*n_ctx + k] = mel_inp.data[j*mel_inp.n_len + i];
+                        k++;
+                        if (k >= 2*n_ctx) {
+                            break;
+                        }
+                    }
+                }
             }
         }
     }

-#ifndef WHISPER_USE_COREML
+    struct ggml_cgraph gf = {};
+    gf.n_threads = n_threads;
+
     struct ggml_tensor * cur;

     // convolution + gelu
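Judging from the code, the new `repeat` mode tiles a short mel window end-to-end until it fills the full `2*n_ctx` (30-second) encoder input, instead of leaving the tail zero-padded; the clustering code at the bottom of this file encodes individual segments this way so their embeddings are not dominated by silence. Note also that the compute graph `gf` is hoisted to function scope, because the new per-layer copies into `kv_enc_self` (next hunks) need to expand it before the final `ggml_graph_compute()`.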
@@ -1433,6 +1459,18 @@ static bool whisper_encode(
         cur = ggml_gelu(ctx0, cur);
     }

+    //{
+    //    //printf("cur: %d %d %d %d, size element = %d\n", cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_element_size(cur));
+
+    //    wctx.use_buf(ctx0, -1);
+
+    //    struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_enc_self.k, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.k)*n_state)*(0*n_ctx));
+    //    //struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_enc_self.v, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.v)*n_state)*(il*n_ctx));
+
+    //    ggml_build_forward_expand(&gf, ggml_cpy(ctx0, cur, k));
+    //    //ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+    //}
+
     wctx.use_buf(ctx0, 3);

     // ===================================================================
@@ -1513,6 +1551,18 @@ static bool whisper_encode(
                             Vcur),
                         Vcur);

+            //{
+            //    //printf("Kcur: %d %d %d %d, size element = %d\n", Kcur->ne[0], Kcur->ne[1], Kcur->ne[2], Kcur->ne[3], ggml_element_size(Kcur));
+
+            //    wctx.use_buf(ctx0, -1);
+
+            //    struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_enc_self.k, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.k)*n_state)*(il*n_ctx));
+            //    struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_enc_self.v, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.v)*n_state)*(il*n_ctx));
+
+            //    ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
+            //    ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+            //}
+
             // ------

             wctx.use_buf(ctx0, 0);
@@ -1597,6 +1647,18 @@ static bool whisper_encode(
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
+
+            {
+                //printf("cur: %d %d %d %d, size element = %d\n", cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_element_size(cur));
+
+                wctx.use_buf(ctx0, -1);
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_enc_self.k, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.k)*n_state)*(il*n_ctx));
+                //struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_enc_self.v, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.v)*n_state)*(il*n_ctx));
+
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, cur, k));
+                //ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+            }
         }

         // projection
@@ -1706,21 +1768,12 @@ static bool whisper_encode(
     // run the computation
     {
-        struct ggml_cgraph gf = {};
-        gf.n_threads = n_threads;
-
         ggml_build_forward_expand(&gf, cur);
         ggml_graph_compute       (ctx0, &gf);

         //ggml_graph_print(&gf);
     }
-#else
-    wctx.use_buf(ctx0, -1);
-
-    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-
-    whisper_coreml_encode(wctx.ctx_coreml, (float *) mel->data, (float *) cur->data);
-#endif

     // cur
     //{
@@ -1736,6 +1789,24 @@ static bool whisper_encode(
     //    printf("\n");
     //}

+    {
+        //const int i0 = std::min(mel_offset, mel_inp.n_len);
+        //const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
+        const int i0 = 0;
+        const int i1 = cur->ne[1];
+
+        //printf("i0 = %d, i1 = %d, (i1 - i0) = %d, embd size = %d\n", i0, i1, i1 - i0, cur->ne[0]);
+
+        wctx.audio_embd.clear();
+        wctx.audio_embd.resize(cur->ne[0], 0.0f);
+        for (int j = 0; j < cur->ne[0]; ++j) {
+            for (int i = i0; i < i1; ++i) {
+                wctx.audio_embd[j] += ((float *)(cur->data))[(i - i0)*cur->ne[0] + j];
+            }
+            wctx.audio_embd[j] /= (i1 - i0);
+        }
+    }
+
     // pre-compute cross-attention memory
     {
         struct ggml_cgraph gf = {};
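This new block mean-pools the final encoder output over the time axis into a single vector of `n_state` floats per encoded window and stashes it in `wctx.audio_embd`; the `#if 0` branch of `whisper_full_cluster_segments()` below uses exactly this vector as the segment embedding.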
@@ -2523,20 +2594,6 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 // interface implementation
 //

-#ifdef WHISPER_USE_COREML
-// replace .bin with .mlmodelc
-static std::string whisper_get_coreml_path(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    path_bin += ".mlmodelc";
-
-    return path_bin;
-}
-#endif
-
 struct whisper_context * whisper_init_from_file(const char * path_model) {
     whisper_model_loader loader = {};
@@ -2549,7 +2606,6 @@ struct whisper_context * whisper_init_from_file(const char * path_model) {
     }

     loader.context = &fin;
-
     loader.read = [](void * ctx, void * output, size_t read_size) {
         std::ifstream * fin = (std::ifstream*)ctx;
         fin->read((char *)output, read_size);
@@ -2566,26 +2622,7 @@ struct whisper_context * whisper_init_from_file(const char * path_model) {
         fin->close();
     };

-    auto ctx = whisper_init(&loader);
-
-    if (ctx) {
-        ctx->path_model = path_model;
-#ifdef WHISPER_USE_COREML
-        const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
-
-        fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-        fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
-
-        ctx->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
-        if (!ctx->ctx_coreml) {
-            fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
-            return nullptr;
-        }
-
-        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
-#endif
-    }
-
-    return ctx;
+    return whisper_init(&loader);
 }

 struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
@@ -2657,10 +2694,6 @@ void whisper_free(struct whisper_context * ctx) {
                 ggml_free(ctx->decoders[i].kv_self.ctx);
             }
         }
-#ifdef WHISPER_USE_COREML
-        whisper_coreml_free(ctx->ctx_coreml);
-        ctx->ctx_coreml = nullptr;
-#endif
         delete ctx;
     }
 }
@@ -3016,9 +3049,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.encoder_begin_callback           =*/ nullptr,
         /*.encoder_begin_callback_user_data =*/ nullptr,
-
-        /*.logits_filter_callback           =*/ nullptr,
-        /*.logits_filter_callback_user_data =*/ nullptr,
     };

     switch (strategy) {
@@ -3146,7 +3176,7 @@ static const std::vector<std::string> non_speech_tokens = {
 // - applies logit filters
 // - computes logprobs and probs
 static void whisper_process_logits(
-              struct whisper_context & ctx,
+        const struct whisper_context & ctx,
         const struct whisper_full_params   params,
               struct whisper_decoder     & decoder,
               float   temperature) {
@@ -3202,9 +3232,6 @@ static void whisper_process_logits(
         logits[vocab.token_translate]  = -INFINITY;
         logits[vocab.token_transcribe] = -INFINITY;

-        if (params.logits_filter_callback) {
-            params.logits_filter_callback(&ctx, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
-        }
-
         // suppress non-speech tokens
         // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
@@ -3908,7 +3935,7 @@ int whisper_full(
                     return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
                 });

-                uint32_t cur_c = 0;
+                unsigned int cur_c = 0;

                 for (int j = 0; j < n_decoders_cur; ++j) {
                     auto & decoder = ctx->decoders[j];
@@ -4393,7 +4420,7 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
 }

 int whisper_full_lang_id(struct whisper_context * ctx) {
     return ctx->lang_id;
 }

 int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
@@ -4866,3 +4893,258 @@ static void whisper_exp_compute_token_level_timestamps(
     //    }
     //}
 }
//
// diarization stuff
//
void whisper_full_cluster_segments(struct whisper_context * ctx) {
const int n_segments = ctx->result_all.size();
printf("%s: clustering %d segments\n", __func__, n_segments);
const auto mel_len_save = ctx->mel.n_len;
printf("%s: mel_len_save = %d\n", __func__, mel_len_save);
const int n_ctx = ctx->model.hparams.n_audio_ctx;
const int n_state = ctx->model.hparams.n_audio_state;
const int n_layer = ctx->model.hparams.n_audio_layer;
#if 0
// use the last layer of the encoder
{
std::vector<float> embd(n_segments*n_state);
for (int i = 0; i < n_segments; ++i) {
const auto & segment_i = ctx->result_all[i];
printf("%s: segment %3d: t0 = %7d, t1 = %7d, text = %s\n", __func__, i, (int) segment_i.t0, (int) segment_i.t1, segment_i.text.c_str());
ctx->mel.n_len = segment_i.t1;
whisper_encode(*ctx, segment_i.t0, 7, true);
for (int j = 0; j < n_state; ++j) {
embd[i*n_state + j] = ctx->audio_embd[j];
}
}
const int n_features = std::min(4, n_segments);
ggml_svd_reduce_dims(n_state, n_segments, embd.data(), n_features);
#elif 0
// use cross kv cache of various layers
for (int il = 0; il < n_layer; ++il) {
std::vector<float> embd(n_segments*n_ctx*n_state);
for (int i = 0; i < n_segments; ++i) {
const auto & segment_i = ctx->result_all[i];
printf("%s: layer %2d, segment %3d: t0 = %7d, t1 = %7d, text = %s\n", __func__, il, i, (int) segment_i.t0, (int) segment_i.t1, segment_i.text.c_str());
ctx->mel.n_len = segment_i.t1;
whisper_encode(*ctx, segment_i.t0, 7, true);
const size_t offs = ggml_element_size(ctx->kv_cross.k)*(il*n_ctx*n_state);
const ggml_fp16_t * f = (const ggml_fp16_t * )((const char *) ctx->kv_cross.k->data + offs);
for (int j = 0; j < n_ctx*n_state; ++j) {
embd[i*n_ctx*n_state + j] = ggml_fp16_to_fp32(f[j]);
}
}
const int n_features = std::min(4, n_segments);
ggml_svd_reduce_dims(n_ctx*n_state, n_segments, embd.data(), n_features);
#elif 0
// use conv embedding
for (int il = 0; il < 1; ++il) {
std::vector<float> embd(n_segments*n_ctx*n_state);
for (int i = 0; i < n_segments; ++i) {
const auto & segment_i = ctx->result_all[i];
printf("%s: layer %2d, segment %3d: t0 = %7d, t1 = %7d, text = %s\n", __func__, il, i, (int) segment_i.t0, (int) segment_i.t1, segment_i.text.c_str());
ctx->mel.n_len = segment_i.t1;
whisper_encode(*ctx, segment_i.t0, 7, true);
const size_t offs = ggml_element_size(ctx->kv_enc_self.k)*(il*n_ctx*n_state);
const ggml_fp16_t * f = (const ggml_fp16_t * )((const char *) ctx->kv_enc_self.k->data + offs);
for (int j = 0; j < n_ctx*n_state; ++j) {
embd[i*n_ctx*n_state + j] = ggml_fp16_to_fp32(f[j]);
}
}
const int n_features = std::min(3, n_segments);
ggml_svd_reduce_dims(n_ctx*n_state, n_segments, embd.data(), n_features);
#else
// use enc self kv cache of various layers
for (int il = 0; il < n_layer; ++il) {
std::vector<float> embd(n_segments*n_ctx*n_state);
for (int i = 0; i < n_segments; ++i) {
const auto & segment_i = ctx->result_all[i];
printf("%s: layer %2d, segment %3d: t0 = %7d, t1 = %7d, text = %s\n", __func__, il, i, (int) segment_i.t0, (int) segment_i.t1, segment_i.text.c_str());
ctx->mel.n_len = segment_i.t1;
whisper_encode(*ctx, segment_i.t0, 7, true);
const size_t offs = ggml_element_size(ctx->kv_enc_self.k)*(il*n_ctx*n_state);
const ggml_fp16_t * f = (const ggml_fp16_t * )((const char *) ctx->kv_enc_self.k->data + offs);
for (int j = 0; j < n_ctx*n_state; ++j) {
embd[i*n_ctx*n_state + j] = ggml_fp16_to_fp32(f[j]);
}
}
const int n_features = std::min(4, n_segments);
ggml_svd_reduce_dims(n_ctx*n_state, n_segments, embd.data(), n_features);
#endif
std::vector<std::vector<double>> features(n_segments);
for (int i = 0; i < n_segments; ++i) {
features[i].resize(n_features);
for (int j = 0; j < n_features; ++j) {
features[i][j] = embd[i*n_features + j];
}
}
// fuzzy c-means clustering
const int n_clusters = 2;
std::vector<std::vector<double>> centroids(n_clusters, std::vector<double>(n_features, 0.0));
std::vector<std::vector<double>> membership(n_segments, std::vector<double>(n_clusters, 0.0));
// initialize the centroids
for (int i = 0; i < n_clusters; ++i) {
for (int j = 0; j < n_features; ++j) {
centroids[i][j] = features[i][j];
}
}
// initialize the membership
for (int i = 0; i < n_segments; ++i) {
//membership[i][i % n_clusters] = 1.0;
//for (int j = 0; j < n_clusters; ++j) {
// membership[i][j] = rand() / (float) RAND_MAX;
//}
for (int j = 0; j < n_clusters; ++j) {
membership[i][j] = 1.0 / n_clusters;
}
}
const int niter = 10000;
// iterate
for (int i = 0; i < niter; ++i) {
// print the membership
if (i == niter - 1) {
//{
for (int i = 0; i < n_segments; ++i) {
#if 1
printf("%s: membership %3d: ", __func__, i);
for (int j = 0; j < n_clusters; ++j) {
printf("%.1f ", membership[i][j]);
}
printf(" '%s'\n", ctx->result_all[i].text.c_str());
#else
printf("%s: features : ", __func__);
for (int j = 0; j < n_features; ++j) {
printf("%8.3f ", features[i][j]);
}
printf(" '%s'\n", ctx->result_all[i].text.c_str());
#endif
}
printf("----------------\n");
// print the centroids
for (int i = 0; i < n_clusters; ++i) {
printf("%s: centroid %d: ", __func__, i);
for (int j = 0; j < n_features; ++j) {
printf("%f ", centroids[i][j]);
}
printf("\n");
}
}
// update the membership
for (int j = 0; j < n_segments; ++j) {
for (int k = 0; k < n_clusters; ++k) {
double sum = 0.0;
for (int l = 0; l < n_clusters; ++l) {
//sum += std::pow(whisper_distance(features[j], centroids[k])/whisper_distance(features[j], centroids[l]), 2.0/(2.0 - 1.0));
double d0 = 0.0;
double d1 = 0.0;
#if 1
// use the euclidean distance
{
for (int m = 0; m < n_features; ++m) {
d0 += std::pow(features[j][m] - centroids[k][m], 2.0);
}
d0 = std::sqrt(d0);
for (int m = 0; m < n_features; ++m) {
d1 += std::pow(features[j][m] - centroids[l][m], 2.0);
}
d1 = std::sqrt(d1);
}
#else
// use the cosine distance
{
double dot = 0.0;
double norm0 = 0.0;
double norm1 = 0.0;
for (int m = 0; m < n_features; ++m) {
dot += features[j][m]*centroids[k][m];
norm0 += std::pow(features[j][m], 2.0);
norm1 += std::pow(centroids[k][m], 2.0);
}
d0 = 1.0 - dot/(std::sqrt(norm0)*std::sqrt(norm1));
dot = 0.0;
norm0 = 0.0;
norm1 = 0.0;
for (int m = 0; m < n_features; ++m) {
dot += features[j][m]*centroids[l][m];
norm0 += std::pow(features[j][m], 2.0);
norm1 += std::pow(centroids[l][m], 2.0);
}
d1 = 1.0 - dot/(std::sqrt(norm0)*std::sqrt(norm1));
}
#endif
if (d1 > 0.0) {
sum += std::pow(d0/d1, 2.0/(1.20 - 1.0));
} else {
sum += 1.0;
}
}
membership[j][k] = sum == 0.0 ? 1.0 : 1.0/sum;
}
}
// update the centroids
for (int j = 0; j < n_clusters; ++j) {
for (int k = 0; k < n_features; ++k) {
double sum = 0.0;
double sum2 = 0.0;
for (int l = 0; l < n_segments; ++l) {
sum += membership[l][j]*features[l][k];
sum2 += membership[l][j];
}
centroids[j][k] = sum2 == 0.0 ? 0.0 : sum/sum2;
}
}
}
}
// restore the mel length
ctx->mel.n_len = mel_len_save;
}
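
The clustering loop above is a plain fuzzy c-means over the SVD-reduced segment embeddings, with `n_clusters = 2` and the fuzzifier hard-coded via the `1.20` literal. In standard notation, with $x_j$ the feature vector of segment $j$, $c_k$ the centroids, $C$ the number of clusters, and $d_{jk} = \lVert x_j - c_k \rVert$, each iteration computes

$$
u_{jk} = \left[ \sum_{l=1}^{C} \left( \frac{d_{jk}}{d_{jl}} \right)^{\frac{2}{m-1}} \right]^{-1},
\qquad
c_k = \frac{\sum_{j} u_{jk}\, x_j}{\sum_{j} u_{jk}},
\qquad m = 1.2 .
$$

(Textbook FCM weights the centroid update by $u_{jk}^m$; this code uses $u_{jk}$ directly.)

End to end, the experimental API is used exactly as in the examples/main/main.cpp hunk above: transcribe first, then cluster. A minimal sketch, with the model path and audio-loading step as placeholders rather than part of this diff:

```cpp
// Hedged sketch of the experimental diarization flow on this branch.
#include "whisper.h"

#include <vector>

int main() {
    struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en.bin");
    if (!ctx) {
        return 1;
    }

    // load 16 kHz mono float PCM here (e.g. with the helpers in examples/common.cpp)
    std::vector<float> pcmf32;

    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    if (whisper_full(ctx, params, pcmf32.data(), (int) pcmf32.size()) == 0) {
        // re-encodes each segment and prints its fuzzy cluster memberships
        // (2 clusters, hard-coded above)
        whisper_full_cluster_segments(ctx);
    }

    whisper_free(ctx);
    return 0;
}
```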

whisper.h

@@ -243,16 +243,6 @@ extern "C" {
     // If it returns false, the computation is aborted
     typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);

-    // Logits filter callback
-    // Can be used to modify the logits before sampling
-    // If not NULL, called after applying temperature to logits
-    typedef void (*whisper_logits_filter_callback)(
-            struct whisper_context * ctx,
-            const whisper_token_data * tokens,
-                             int   n_tokens,
-                           float * logits,
-                            void * user_data);
-
     // Parameters for the whisper_full() function
     // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
     // whisper_full_default_params()
@@ -325,10 +315,6 @@ extern "C" {
         // called each time before the encoder starts
         whisper_encoder_begin_callback encoder_begin_callback;
         void * encoder_begin_callback_user_data;
-
-        // called by each decoder to filter obtained logits
-        whisper_logits_filter_callback logits_filter_callback;
-        void * logits_filter_callback_user_data;
     };

     WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
@@ -386,6 +372,10 @@ extern "C" {
     WHISPER_API int whisper_bench_memcpy(int n_threads);
     WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);

+    // Temporary experimental API
+    WHISPER_API void whisper_full_cluster_segments(struct whisper_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif