Compare commits


4 Commits

16 changed files with 592 additions and 697 deletions
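Orientation, judging from the hunks below: the base (old) side is the v1.2.1 tree with Core ML support, and the head (new) side is an experimental speaker-diarization branch cut from v1.2.0 — so the diff reads as Core ML support being "removed" while an experimental segment-clustering API (`whisper_full_cluster_segments`, `ggml_svd_reduce_dims`) is added.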

.gitignore

@@ -1,7 +1,5 @@
 *.o
 *.a
-*.mlmodel
-*.mlmodelc
 .cache/
 .vs/
 .vscode/
@@ -12,7 +10,6 @@ build-em/
 build-debug/
 build-release/
 build-static/
-build-no-accel/
 build-sanitize-addr/
 build-sanitize-thread/

CMakeLists.txt

@@ -1,6 +1,6 @@
 cmake_minimum_required (VERSION 3.0)

-project(whisper.cpp VERSION 1.2.1)
+project(whisper.cpp VERSION 1.2.0)

 # Add path to modules
 list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
@@ -54,8 +54,6 @@ if (APPLE)
     option(WHISPER_NO_AVX              "whisper: disable AVX" OFF)
     option(WHISPER_NO_AVX2             "whisper: disable AVX2" OFF)
     option(WHISPER_NO_FMA              "whisper: disable FMA" OFF)
-
-    option(WHISPER_COREML              "whisper: enable Core ML framework" OFF)
 else()
     option(WHISPER_SUPPORT_OPENBLAS    "whisper: support for OpenBLAS" OFF)
 endif()
@@ -88,33 +86,16 @@ endif()

 find_package(Threads REQUIRED)

-# on APPLE
-if (APPLE)
-    # include Accelerate framework
-    if (NOT WHISPER_NO_ACCELERATE)
-        find_library(ACCELERATE_FRAMEWORK Accelerate)
-        if (ACCELERATE_FRAMEWORK)
-            message(STATUS "Accelerate framework found")
-
-            set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
-        else()
-            message(WARNING "Accelerate framework not found")
-        endif()
-    endif()
-
-    if (WHISPER_COREML)
-        find_library(FOUNDATION_FRAMEWORK Foundation)
-        find_library(COREML_FRAMEWORK CoreML)
-
-        if (COREML_FRAMEWORK)
-            message(STATUS "CoreML framework found")
-
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
-        else()
-            message(WARNING "CoreML framework not found")
-        endif()
-    endif()
+# on APPLE - include Accelerate framework
+if (APPLE AND NOT WHISPER_NO_ACCELERATE)
+    find_library(ACCELERATE_FRAMEWORK Accelerate)
+    if (ACCELERATE_FRAMEWORK)
+        message(STATUS "Accelerate framework found")
+
+        set(WHISPER_EXTRA_LIBS  ${WHISPER_EXTRA_LIBS}  ${ACCELERATE_FRAMEWORK})
+        set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
+    else()
+        message(WARNING "Accelerate framework not found")
+    endif()
 endif()
@@ -200,33 +181,6 @@ if (WHISPER_PERF)
     set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
 endif()

-#
-# whisper.coreml - Core ML support
-#
-
-if (WHISPER_COREML)
-    set(TARGET whisper.coreml)
-
-    add_library(${TARGET}
-        coreml/whisper-encoder.h
-        coreml/whisper-encoder.mm
-        coreml/whisper-encoder-impl.h
-        coreml/whisper-encoder-impl.m
-        )
-
-    include(DefaultTargetOptions)
-
-    target_include_directories(${TARGET} PUBLIC
-        .
-        )
-
-    target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
-
-    set_target_properties(${TARGET} PROPERTIES
-        COMPILE_FLAGS "-fobjc-arc"
-        )
-endif()
-
 #
 # whisper - this is the main library of the project
 #
@@ -246,10 +200,6 @@ target_include_directories(${TARGET} PUBLIC
     .
     )

-if (WHISPER_COREML)
-    target_link_libraries(${TARGET} PRIVATE whisper.coreml)
-endif()
-
 if (MSVC)
     target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

Makefile

@@ -30,8 +30,8 @@ endif
 # Compile flags
 #

-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =

 # OS specific
@@ -132,10 +132,6 @@ ifndef WHISPER_NO_ACCELERATE
 		LDFLAGS += -framework Accelerate
 	endif
 endif
-ifdef WHISPER_COREML
-	CXXFLAGS += -DWHISPER_USE_COREML
-	LDFLAGS += -framework Foundation -framework CoreML
-endif
 ifdef WHISPER_OPENBLAS
 	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
@@ -145,8 +141,6 @@ ifdef WHISPER_GPROF
 	CXXFLAGS += -pg
 endif
 ifneq ($(filter aarch64%,$(UNAME_M)),)
-	CFLAGS   += -mcpu=native
-	CXXFLAGS += -mcpu=native
 endif
 ifneq ($(filter armv6%,$(UNAME_M)),)
 	# Raspberry Pi 1, 2, 3
@@ -188,23 +182,11 @@ ggml.o: ggml.c ggml.h
 whisper.o: whisper.cpp whisper.h
 	$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o

-ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
-else
-whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
-	$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
-
-whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
-	$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
-
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
-endif
-
-libwhisper.a: ggml.o $(WHISPER_OBJ)
-	$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
+libwhisper.a: ggml.o whisper.o
+	$(AR) rcs libwhisper.a ggml.o whisper.o

-libwhisper.so: ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
+libwhisper.so: ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)

 clean:
 	rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
@@ -218,21 +200,21 @@ CC_SDL=`sdl2-config --cflags --libs`
 SRC_COMMON     = examples/common.cpp
 SRC_COMMON_SDL = examples/common-sdl.cpp

-main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
+main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
 	./main -h

-stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
+stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)

-command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
+command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)

-talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
+talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)

-bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
+bench: examples/bench/bench.cpp ggml.o whisper.o
+	$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)

 #
 # Audio samples
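On the base side, Core ML was opt-in at build time (`WHISPER_COREML=1 make`, which triggers the `ifdef WHISPER_COREML` block above and compiles the Objective-C wrapper objects); the head branch removes that path entirely and links plain `whisper.o` everywhere.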

README.md

@@ -4,7 +4,7 @@
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)

-Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

 High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@@ -469,9 +469,7 @@ in [models](models).
 - [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
   - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
   - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
-- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
-  - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
-  - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
+- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)

 ## Examples

package.json

@@ -1,6 +1,6 @@
 {
   "name": "whisper.cpp",
-  "version": "1.2.1",
+  "version": "1.2.0",
   "description": "Whisper speech recognition",
   "main": "whisper.js",
   "scripts": {

coreml/whisper-encoder-impl.h

@@ -1,142 +0,0 @@ (file removed on the head branch)
//
// CoremlEncoder.h
//
// This file was automatically generated and should not be edited.
//
#import <Foundation/Foundation.h>
#import <CoreML/CoreML.h>
#include <stdint.h>
#include <os/log.h>
NS_ASSUME_NONNULL_BEGIN
/// Model Prediction Input Type
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
@end
/// Model Prediction Output Type
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
/// output as multidimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * output;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
@end
/// Class for model loading and prediction
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoder : NSObject
@property (readonly, nonatomic, nullable) MLModel * model;
/**
URL of the underlying .mlmodelc directory.
*/
+ (nullable NSURL *)URLOfModelInThisBundle;
/**
Initialize CoremlEncoder instance from an existing MLModel object.
Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
*/
- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
/**
Initialize CoremlEncoder instance with the model in this bundle.
*/
- (nullable instancetype)init;
/**
Initialize CoremlEncoder instance with the model in this bundle.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Construct CoremlEncoder instance asynchronously with configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
/**
Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param modelURL The model URL.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
/**
Make a prediction using the standard interface
@param input an instance of CoremlEncoderInput to predict from
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Make a prediction using the standard interface
@param input an instance of CoremlEncoderInput to predict from
@param options prediction options
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Make a prediction using the convenience interface
@param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Batch prediction
@param inputArray array of CoremlEncoderInput instances to obtain predictions from
@param options prediction options
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the predictions as NSArray<CoremlEncoderOutput *>
*/
- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
@end
NS_ASSUME_NONNULL_END

coreml/whisper-encoder-impl.m

@@ -1,197 +0,0 @@ (file removed on the head branch)
//
// CoremlEncoder.m
//
// This file was automatically generated and should not be edited.
//
#if !__has_feature(objc_arc)
#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
#endif
#import "whisper-encoder-impl.h"
@implementation CoremlEncoderInput
- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
self = [super init];
if (self) {
_melSegment = melSegment;
}
return self;
}
- (NSSet<NSString *> *)featureNames {
return [NSSet setWithArray:@[@"melSegment"]];
}
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
if ([featureName isEqualToString:@"melSegment"]) {
return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
}
return nil;
}
@end
@implementation CoremlEncoderOutput
- (instancetype)initWithOutput:(MLMultiArray *)output {
self = [super init];
if (self) {
_output = output;
}
return self;
}
- (NSSet<NSString *> *)featureNames {
return [NSSet setWithArray:@[@"output"]];
}
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
if ([featureName isEqualToString:@"output"]) {
return [MLFeatureValue featureValueWithMultiArray:self.output];
}
return nil;
}
@end
@implementation CoremlEncoder
/**
URL of the underlying .mlmodelc directory.
*/
+ (nullable NSURL *)URLOfModelInThisBundle {
NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
return [NSURL fileURLWithPath:assetPath];
}
/**
Initialize CoremlEncoder instance from an existing MLModel object.
Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
*/
- (instancetype)initWithMLModel:(MLModel *)model {
self = [super init];
if (!self) { return nil; }
_model = model;
if (_model == nil) { return nil; }
return self;
}
/**
Initialize CoremlEncoder instance with the model in this bundle.
*/
- (nullable instancetype)init {
return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
}
/**
Initialize CoremlEncoder instance with the model in this bundle.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
}
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
if (model == nil) { return nil; }
return [self initWithMLModel:model];
}
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
if (model == nil) { return nil; }
return [self initWithMLModel:model];
}
/**
Construct CoremlEncoder instance asynchronously with configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
[self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
configuration:configuration
completionHandler:handler];
}
/**
Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param modelURL The model URL.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
[MLModel loadContentsOfURL:modelURL
configuration:configuration
completionHandler:^(MLModel *model, NSError *error) {
if (model != nil) {
CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
handler(typedModel, nil);
} else {
handler(nil, error);
}
}];
}
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
}
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
if (!outFeatures) { return nil; }
return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
}
- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
return [self predictionFromFeatures:input_ error:error];
}
- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
if (!outBatch) { return nil; }
NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
for (NSInteger i = 0; i < outBatch.count; i++) {
id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
[results addObject:result];
}
return results;
}
@end

coreml/whisper-encoder.h

@@ -1,22 +0,0 @@ (file removed on the head branch)
// Wrapper of the Core ML Whisper Encoder model
//
// Code is derived from the work of Github user @wangchou
// ref: https://github.com/wangchou/callCoreMLFromCpp
#if __cplusplus
extern "C" {
#endif
struct whisper_coreml_context;
struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
void whisper_coreml_free(struct whisper_coreml_context * ctx);
void whisper_coreml_encode(
const whisper_coreml_context * ctx,
float * mel,
float * out);
#if __cplusplus
}
#endif
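
For context, this removed wrapper exposed the Core ML encoder through just three C calls. Below is a minimal sketch of driving it directly; the model path and the base model's output size (n_state = 512, n_ctx = 1500) are assumptions for illustration, while the 1 × 80 × 3000 mel shape comes from the generated header above. The base branch calls it the same way from `whisper_encode()`, as the whisper.cpp hunks further down show.

```cpp
// Hedged sketch: drive the (removed) Core ML encoder wrapper on the base branch.
// Assumes a compiled model at models/ggml-base.en.mlmodelc and base-model
// dimensions n_state = 512, n_ctx = 1500.
#include "coreml/whisper-encoder.h"

#include <vector>

int main() {
    struct whisper_coreml_context * cml = whisper_coreml_init("models/ggml-base.en.mlmodelc");
    if (cml == NULL) {
        return 1;
    }

    std::vector<float> mel(80*3000);   // one 30-second mel segment (1 x 80 x 3000)
    std::vector<float> out(512*1500);  // encoder output, n_state x n_ctx

    // runs the Core ML prediction and copies the result into `out`
    whisper_coreml_encode(cml, mel.data(), out.data());

    whisper_coreml_free(cml);
    return 0;
}
```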

coreml/whisper-encoder.mm

@@ -1,61 +0,0 @@ (file removed on the head branch)
#import "coreml/whisper-encoder.h"
#import "coreml/whisper-encoder-impl.h"
#import <CoreML/CoreML.h>
#include <stdlib.h>
#if __cplusplus
extern "C" {
#endif
struct whisper_coreml_context {
const void * data;
};
struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
if (data == NULL) {
return NULL;
}
whisper_coreml_context * ctx = new whisper_coreml_context;
ctx->data = data;
return ctx;
}
void whisper_coreml_free(struct whisper_coreml_context * ctx) {
CFRelease(ctx->data);
delete ctx;
}
void whisper_coreml_encode(
const whisper_coreml_context * ctx,
float * mel,
float * out) {
MLMultiArray * inMultiArray = [
[MLMultiArray alloc] initWithDataPointer: mel
shape: @[@1, @80, @3000]
dataType: MLMultiArrayDataTypeFloat32
strides: @[@(240000), @(3000), @1]
deallocator: nil
error: nil
];
CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];
MLMultiArray * outMA = outCoreML.output;
memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
}
#if __cplusplus
}
#endif

examples/main/main.cpp

@@ -352,14 +352,13 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
     fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

     const int n_segments = whisper_full_n_segments(ctx);
-    fout << "start,end,text\n";
     for (int i = 0; i < n_segments; ++i) {
         const char * text = whisper_full_get_segment_text(ctx, i);
         const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
         const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
         //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
-        fout << 10 * t0 << "," << 10 * t1 << ",\"" << text << "\"\n";
+        fout << 10 * t0 << ", " << 10 * t1 << ", \"" << text << "\"\n";
     }

     return true;
@@ -619,6 +618,8 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "%s: failed to process audio\n", argv[0]);
             return 10;
         }
+
+        whisper_full_cluster_segments(ctx);
     }

     // output stuff
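Two incidental behavioral changes in the CSV writer: the head side drops the `start,end,text` header row and puts a space after each separator, so downstream parsers that expect the base branch's header will need adjusting. The main hook for this branch is the second hunk: `whisper_full_cluster_segments(ctx)` runs right after a successful `whisper_full()` call.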

ggml.c

@@ -8517,6 +8517,195 @@ enum ggml_opt_result ggml_opt(

 ////////////////////////////////////////////////////////////////////////////////
void ggml_svd_reduce_dims(
int ne0,
int ne1,
float * a,
int nd) {
int n = ne1;
int m = ne0;
float * A = a;
float * A0 = (float *) malloc(n * m * sizeof(float));
// average vector
//float * M = (float *) malloc(m * sizeof(float));
//{
// for (int j = 0; j < m; ++j) {
// M[j] = 0.0f;
// }
// for (int i = 0; i < n; ++i) {
// for (int j = 0; j < m; ++j) {
// M[j] += A[i * m + j];
// }
// }
// for (int j = 0; j < m; ++j) {
// M[j] /= (float) n;
// }
//}
//// subtract average vector
//for (int i = 0; i < n; ++i) {
// for (int j = 0; j < m; ++j) {
// A[i * m + j] -= M[j];
// }
//}
//free(M);
memcpy(A0, A, n * m * sizeof(float));
// print A
//printf("A:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < m; ++j) {
// printf("%9.5f ", A[i * m + j]);
// }
// printf("\n");
//}
//printf("\n");
// SVD
// A = U * S * V^T
float * U = (float *) malloc(n * m * sizeof(float));
float * S = (float *) malloc(n * sizeof(float));
float * V = (float *) malloc(n * n * sizeof(float));
int lda = m;
int ldu = m;
int ldvt = n;
float work_size;
int lwork = -1;
int info = 0;
sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, &work_size, &lwork, &info);
lwork = (int) work_size;
//printf("work_size = %f, info = %d, lwork = %d\n", work_size, info, lwork);
float * work = (float *) malloc(lwork * sizeof(float));
sgesvd_("S", "S", &m, &n, A, &lda, S, U, &ldu, V, &ldvt, work, &lwork, &info);
free(work);
// print U
//printf("U:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < m; ++j) {
// printf("%9.5f ", U[i * m + j]);
// }
// printf("\n");
//}
//printf("\n");
// normalize S
{
double sum = 0.0;
for (int i = 0; i < n; ++i) {
sum += S[i];
}
sum *= sqrt((double) m);
for (int i = 0; i < n; ++i) {
S[i] /= sum;
}
}
// print S
printf("S:\n");
for (int i = 0; i < n; ++i) {
printf("- %d = %9.5f\n", i, S[i]);
}
printf("\n");
// print V
//printf("V:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < n; ++j) {
// printf("%9.5f ", V[i * n + j]);
// }
// printf("\n");
//}
//printf("\n");
// print A
//printf("A:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < m; ++j) {
// printf("%9.5f ", A[i * m + j]);
// }
// printf("\n");
//}
//printf("\n");
// compute singular vectors in U
for (int i = 0; i < n; ++i) {
for (int j = 0; j < m; ++j) {
U[i * m + j] *= S[i];
}
}
// normalize U
for (int i = 0; i < n; ++i) {
double sum = 0.0;
for (int j = 0; j < m; ++j) {
sum += U[i * m + j] * U[i * m + j];
}
sum = sqrt(sum);
for (int j = 0; j < m; ++j) {
U[i * m + j] /= sum*sqrt((double) m);
}
}
// print U
//printf("U:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < m; ++j) {
// printf("%9.5f ", U[i * m + j]);
// }
// printf("\n");
//}
//printf("\n");
// project A0 onto U
for (int i = 0; i < n; ++i) {
for (int j = 0; j < nd; ++j) {
A[i * nd + j] = 0.0f;
//if (j == 0) continue;
for (int k = 0; k < m; ++k) {
A[i * nd + j] += A0[i * m + k] * U[j * m + k];
}
}
}
// print A
//printf("A:\n");
//for (int i = 0; i < n; ++i) {
// printf("col %d : ", i);
// for (int j = 0; j < n; ++j) {
// printf("%9.5f ", A[i * n + j]);
// }
// printf("\n");
//}
//printf("\n");
free(U);
free(S);
free(V);
free(A0);
}
////////////////////////////////////////////////////////////////////////////////
 int ggml_cpu_has_avx(void) {
 #if defined(__AVX__)
     return 1;
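
The new helper is easiest to read from its call sites in whisper.cpp below: it takes `ne1` row vectors of length `ne0`, runs LAPACK's `sgesvd_` on them, scales the singular vectors by the normalized singular values, and overwrites the front of the buffer with each row projected onto the top `nd` singular vectors (row `i` ends up at `a[i*nd]`). A minimal sketch under those assumptions follows; the test values are made up, and linking needs a LAPACK provider such as Apple's Accelerate framework, which the build files above already pull in on macOS.

```cpp
// Hedged sketch: in-place SVD dimensionality reduction with the new helper.
#include <cstdio>
#include <vector>

extern "C" void ggml_svd_reduce_dims(int ne0, int ne1, float * a, int nd);

int main() {
    const int ne0 = 8;   // original feature dimension
    const int ne1 = 4;   // number of vectors (rows)
    const int nd  = 2;   // reduced dimension

    std::vector<float> a(ne0*ne1);
    for (size_t i = 0; i < a.size(); ++i) {
        a[i] = (float) (i % 7) - 3.0f;   // arbitrary test data
    }

    ggml_svd_reduce_dims(ne0, ne1, a.data(), nd);

    // the first nd floats of each row now hold the reduced coordinates
    for (int i = 0; i < ne1; ++i) {
        printf("row %d: %8.4f %8.4f\n", i, a[i*nd + 0], a[i*nd + 1]);
    }
    return 0;
}
```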

ggml.h

@@ -726,6 +726,16 @@ enum ggml_opt_result ggml_opt(
         struct ggml_opt_params params,
         struct ggml_tensor * f);

+//
+// Temp stuff
+//
+
+void ggml_svd_reduce_dims(
+        int ne0,
+        int ne1,
+        float * a,
+        int nd);
+
 //
 // system info
 //

models/download-coreml-model.sh

@@ -1,82 +0,0 @@ (file removed on the head branch)
#!/bin/bash
# This script downloads Whisper model files that have already been converted to Core ML format.
# This way you don't have to convert them yourself.
src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
pfx="resolve/main/ggml"
# get the path of this script
function get_script_path() {
if [ -x "$(command -v realpath)" ]; then
echo "$(dirname $(realpath $0))"
else
local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
echo "$ret"
fi
}
models_path="$(get_script_path)"
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {
printf "\n"
printf " Available models:"
for model in "${models[@]}"; do
printf " $model"
done
printf "\n\n"
}
if [ "$#" -ne 1 ]; then
printf "Usage: $0 <model>\n"
list_models
exit 1
fi
model=$1
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
printf "Invalid model: $model\n"
list_models
exit 1
fi
# download Core ML model
printf "Downloading Core ML model $model from '$src' ...\n"
cd $models_path
if [ -f "ggml-$model.mlmodel" ]; then
printf "Model $model already exists. Skipping download.\n"
exit 0
fi
if [ -x "$(command -v wget)" ]; then
wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
elif [ -x "$(command -v curl)" ]; then
curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
else
printf "Either wget or curl is required to download models.\n"
exit 1
fi
if [ $? -ne 0 ]; then
printf "Failed to download Core ML model $model \n"
printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
exit 1
fi
printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
printf "Run the following command to compile it:\n\n"
printf " $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
printf "You can now use it like this:\n\n"
printf " $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
printf "\n"

whisper.cpp

@@ -1,8 +1,5 @@
 #define WHISPER_BUILD
 #include "whisper.h"
-#if WHISPER_USE_COREML
-#include "coreml/whisper-encoder.h"
-#endif

 #include "ggml.h"
@@ -271,6 +268,14 @@ static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
     { MODEL_LARGE,    71ull*MB },
 };

+static const std::map<e_model, size_t> MEM_REQ_KV_ENC_SELF = {
+    { MODEL_TINY,     23ull*MB },
+    { MODEL_BASE,     26ull*MB },
+    { MODEL_SMALL,   216ull*MB },
+    { MODEL_MEDIUM,  243ull*MB },
+    { MODEL_LARGE,   271ull*MB },
+};
+
 static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
     { MODEL_TINY,      9ull*MB },
     { MODEL_BASE,     18ull*MB },
@@ -574,6 +579,7 @@ struct whisper_context {
     // cross-attention KV cache for the decoders
     // shared between all decoders
     whisper_kv_cache kv_cross;
+    whisper_kv_cache kv_enc_self;

     whisper_decoder decoders[WHISPER_MAX_DECODERS] = {};
@@ -595,21 +601,18 @@ struct whisper_context {
     mutable std::mt19937 rng; // used for sampling at t > 0.0

-    int lang_id = 0; // english by default
-
-    std::string path_model; // populated by whisper_init_from_file()
-#ifdef WHISPER_USE_COREML
-    whisper_coreml_context * ctx_coreml;
-#endif
+    int lang_id;

     // [EXPERIMENTAL] token-level timestamps data
-    int64_t t_beg = 0;
-    int64_t t_last = 0;
+    int64_t t_beg;
+    int64_t t_last;
     whisper_token tid_last;
     std::vector<float> energy; // PCM signal energy

     // [EXPERIMENTAL] speed-up techniques
-    int32_t exp_n_audio_ctx = 0; // 0 - use default
+    int32_t exp_n_audio_ctx; // 0 - use default
+
+    std::vector<float> audio_embd;

     void use_buf(struct ggml_context * ctx, int i) {
 #if defined(WHISPER_USE_SCRATCH)
@@ -844,6 +847,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         return false;
     }

+    if (!kv_cache_init(model.hparams, scale*MEM_REQ_KV_ENC_SELF.at(model.type), wctx.kv_enc_self, wctx.wtype, model.hparams.n_audio_ctx)) {
+        fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
+        return false;
+    }
+
     {
         const size_t memory_size = ggml_nbytes(wctx.kv_cross.k) + ggml_nbytes(wctx.kv_cross.v);
         fprintf(stderr, "%s: kv cross size   = %7.2f MB\n", __func__, memory_size/1024.0/1024.0);
@@ -1366,7 +1374,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 static bool whisper_encode(
               whisper_context & wctx,
         const int   mel_offset,
-        const int   n_threads) {
+        const int   n_threads,
+              bool  repeat = false) {

     const int64_t t_start_us = ggml_time_us();

     const auto & model = wctx.model;
@@ -1398,14 +1407,31 @@ static bool whisper_encode(
         const int i0 = std::min(mel_offset, mel_inp.n_len);
         const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);

-        for (int j = 0; j < mel_inp.n_mel; ++j) {
-            for (int i = i0; i < i1; ++i) {
-                dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
+        if (repeat == false) {
+            for (int j = 0; j < mel_inp.n_mel; ++j) {
+                for (int i = i0; i < i1; ++i) {
+                    dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
+                }
+            }
+        } else {
+            for (int j = 0; j < mel_inp.n_mel; ++j) {
+                int k = 0;
+                while (k < 2*n_ctx) {
+                    for (int i = i0; i < i1; ++i) {
+                        dst[j*2*n_ctx + k] = mel_inp.data[j*mel_inp.n_len + i];
+                        k++;
+                        if (k >= 2*n_ctx) {
+                            break;
+                        }
+                    }
+                }
             }
         }
     }

-#ifndef WHISPER_USE_COREML
+    struct ggml_cgraph gf = {};
+    gf.n_threads = n_threads;
+
     struct ggml_tensor * cur;

     // convolution + gelu
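Judging from the code, the new `repeat` mode tiles a short mel window end-to-end until it fills the full `2*n_ctx` (30-second) encoder input, instead of leaving the tail zero-padded; the clustering code at the bottom of this file encodes individual segments this way so their embeddings are not dominated by silence. Note also that the compute graph `gf` is hoisted to function scope, because the new per-layer copies into `kv_enc_self` (next hunks) need to expand it before the final `ggml_graph_compute()`.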
@@ -1433,6 +1459,18 @@ static bool whisper_encode(
         cur = ggml_gelu(ctx0, cur);
     }

+    //{
+    //    //printf("cur: %d %d %d %d, size element = %d\n", cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_element_size(cur));
+
+    //    wctx.use_buf(ctx0, -1);
+
+    //    struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_enc_self.k, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.k)*n_state)*(0*n_ctx));
+    //    //struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_enc_self.v, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.v)*n_state)*(il*n_ctx));
+
+    //    ggml_build_forward_expand(&gf, ggml_cpy(ctx0, cur, k));
+    //    //ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+    //}
+
     wctx.use_buf(ctx0, 3);

     // ===================================================================
@@ -1513,6 +1551,18 @@ static bool whisper_encode(
                             Vcur),
                         Vcur);

+            //{
+            //    //printf("Kcur: %d %d %d %d, size element = %d\n", Kcur->ne[0], Kcur->ne[1], Kcur->ne[2], Kcur->ne[3], ggml_element_size(Kcur));
+
+            //    wctx.use_buf(ctx0, -1);
+
+            //    struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_enc_self.k, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.k)*n_state)*(il*n_ctx));
+            //    struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_enc_self.v, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.v)*n_state)*(il*n_ctx));
+
+            //    ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
+            //    ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+            //}
+
             // ------

             wctx.use_buf(ctx0, 0);
@@ -1597,6 +1647,18 @@ static bool whisper_encode(
             cur = ggml_cpy(ctx0,
                     KQV_merged,
                     ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
+
+            {
+                //printf("cur: %d %d %d %d, size element = %d\n", cur->ne[0], cur->ne[1], cur->ne[2], cur->ne[3], ggml_element_size(cur));
+
+                wctx.use_buf(ctx0, -1);
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, wctx.kv_enc_self.k, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.k)*n_state)*(il*n_ctx));
+                //struct ggml_tensor * v = ggml_view_1d(ctx0, wctx.kv_enc_self.v, n_state*n_ctx, (ggml_element_size(wctx.kv_enc_self.v)*n_state)*(il*n_ctx));
+
+                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, cur, k));
+                //ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+            }
         }

         // projection
@@ -1706,21 +1768,12 @@ static bool whisper_encode(
     // run the computation
     {
-        struct ggml_cgraph gf = {};
-        gf.n_threads = n_threads;
-
         ggml_build_forward_expand(&gf, cur);
         ggml_graph_compute       (ctx0, &gf);

         //ggml_graph_print(&gf);
     }
-#else
-    wctx.use_buf(ctx0, -1);
-
-    struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
-
-    whisper_coreml_encode(wctx.ctx_coreml, (float *) mel->data, (float *) cur->data);
-#endif

     // cur
     //{
@@ -1736,6 +1789,24 @@ static bool whisper_encode(
     //    printf("\n");
     //}

+    {
+        //const int i0 = std::min(mel_offset, mel_inp.n_len);
+        //const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
+        const int i0 = 0;
+        const int i1 = cur->ne[1];
+
+        //printf("i0 = %d, i1 = %d, (i1 - i0) = %d, embd size = %d\n", i0, i1, i1 - i0, cur->ne[0]);
+
+        wctx.audio_embd.clear();
+        wctx.audio_embd.resize(cur->ne[0], 0.0f);
+        for (int j = 0; j < cur->ne[0]; ++j) {
+            for (int i = i0; i < i1; ++i) {
+                wctx.audio_embd[j] += ((float *)(cur->data))[(i - i0)*cur->ne[0] + j];
+            }
+            wctx.audio_embd[j] /= (i1 - i0);
+        }
+    }
+
     // pre-compute cross-attention memory
     {
         struct ggml_cgraph gf = {};
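This new block mean-pools the final encoder output over the time axis into a single vector of `n_state` floats per encoded window and stashes it in `wctx.audio_embd`; the `#if 0` branch of `whisper_full_cluster_segments()` below uses exactly this vector as the segment embedding.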
@@ -2523,20 +2594,6 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
 // interface implementation
 //

-#ifdef WHISPER_USE_COREML
-// replace .bin with .mlmodelc
-static std::string whisper_get_coreml_path(std::string path_bin) {
-    auto pos = path_bin.rfind('.');
-    if (pos != std::string::npos) {
-        path_bin = path_bin.substr(0, pos);
-    }
-
-    path_bin += ".mlmodelc";
-
-    return path_bin;
-}
-#endif
-
 struct whisper_context * whisper_init_from_file(const char * path_model) {
     whisper_model_loader loader = {};
@@ -2549,7 +2606,6 @@ struct whisper_context * whisper_init_from_file(const char * path_model) {
     }

     loader.context = &fin;
-
     loader.read = [](void * ctx, void * output, size_t read_size) {
         std::ifstream * fin = (std::ifstream*)ctx;
         fin->read((char *)output, read_size);
@@ -2566,26 +2622,7 @@ struct whisper_context * whisper_init_from_file(const char * path_model) {
         fin->close();
     };

-    auto ctx = whisper_init(&loader);
-
-    if (ctx) {
-        ctx->path_model = path_model;
-#ifdef WHISPER_USE_COREML
-        const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
-
-        fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
-        fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
-
-        ctx->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
-        if (!ctx->ctx_coreml) {
-            fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
-            return nullptr;
-        }
-
-        fprintf(stderr, "%s: Core ML model loaded\n", __func__);
-#endif
-    }
-
-    return ctx;
+    return whisper_init(&loader);
 }

 struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
@@ -2657,10 +2694,6 @@ void whisper_free(struct whisper_context * ctx) {
                 ggml_free(ctx->decoders[i].kv_self.ctx);
             }
         }
-#ifdef WHISPER_USE_COREML
-        whisper_coreml_free(ctx->ctx_coreml);
-        ctx->ctx_coreml = nullptr;
-#endif
         delete ctx;
     }
 }
@@ -3016,9 +3049,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
         /*.encoder_begin_callback           =*/ nullptr,
         /*.encoder_begin_callback_user_data =*/ nullptr,
-
-        /*.logits_filter_callback           =*/ nullptr,
-        /*.logits_filter_callback_user_data =*/ nullptr,
     };

     switch (strategy) {
@@ -3146,7 +3176,7 @@ static const std::vector<std::string> non_speech_tokens = {
 // - applies logit filters
 // - computes logprobs and probs
 static void whisper_process_logits(
-              struct whisper_context & ctx,
+        const struct whisper_context & ctx,
         const struct whisper_full_params   params,
               struct whisper_decoder     & decoder,
               float   temperature) {
@@ -3202,9 +3232,6 @@ static void whisper_process_logits(
         logits[vocab.token_translate]  = -INFINITY;
         logits[vocab.token_transcribe] = -INFINITY;

-        if (params.logits_filter_callback) {
-            params.logits_filter_callback(&ctx, tokens_cur.data(), tokens_cur.size(), logits.data(), params.logits_filter_callback_user_data);
-        }
-
         // suppress non-speech tokens
         // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
@@ -3908,7 +3935,7 @@ int whisper_full(
                     return a.sequence.sum_logprobs_all > b.sequence.sum_logprobs_all;
                 });

-                uint32_t cur_c = 0;
+                unsigned int cur_c = 0;

                 for (int j = 0; j < n_decoders_cur; ++j) {
                     auto & decoder = ctx->decoders[j];
@@ -4393,7 +4420,7 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
 }

 int whisper_full_lang_id(struct whisper_context * ctx) {
     return ctx->lang_id;
 }

 int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
@@ -4866,3 +4893,258 @@ static void whisper_exp_compute_token_level_timestamps(
     //    }
     //}
 }
//
// diarization stuff
//
void whisper_full_cluster_segments(struct whisper_context * ctx) {
const int n_segments = ctx->result_all.size();
printf("%s: clustering %d segments\n", __func__, n_segments);
const auto mel_len_save = ctx->mel.n_len;
printf("%s: mel_len_save = %d\n", __func__, mel_len_save);
const int n_ctx = ctx->model.hparams.n_audio_ctx;
const int n_state = ctx->model.hparams.n_audio_state;
const int n_layer = ctx->model.hparams.n_audio_layer;
#if 0
// use the last layer of the encoder
{
std::vector<float> embd(n_segments*n_state);
for (int i = 0; i < n_segments; ++i) {
const auto & segment_i = ctx->result_all[i];
printf("%s: segment %3d: t0 = %7d, t1 = %7d, text = %s\n", __func__, i, (int) segment_i.t0, (int) segment_i.t1, segment_i.text.c_str());
ctx->mel.n_len = segment_i.t1;
whisper_encode(*ctx, segment_i.t0, 7, true);
for (int j = 0; j < n_state; ++j) {
embd[i*n_state + j] = ctx->audio_embd[j];
}
}
const int n_features = std::min(4, n_segments);
ggml_svd_reduce_dims(n_state, n_segments, embd.data(), n_features);
#elif 0
// use cross kv cache of various layers
for (int il = 0; il < n_layer; ++il) {
std::vector<float> embd(n_segments*n_ctx*n_state);
for (int i = 0; i < n_segments; ++i) {
const auto & segment_i = ctx->result_all[i];
printf("%s: layer %2d, segment %3d: t0 = %7d, t1 = %7d, text = %s\n", __func__, il, i, (int) segment_i.t0, (int) segment_i.t1, segment_i.text.c_str());
ctx->mel.n_len = segment_i.t1;
whisper_encode(*ctx, segment_i.t0, 7, true);
const size_t offs = ggml_element_size(ctx->kv_cross.k)*(il*n_ctx*n_state);
const ggml_fp16_t * f = (const ggml_fp16_t * )((const char *) ctx->kv_cross.k->data + offs);
for (int j = 0; j < n_ctx*n_state; ++j) {
embd[i*n_ctx*n_state + j] = ggml_fp16_to_fp32(f[j]);
}
}
const int n_features = std::min(4, n_segments);
ggml_svd_reduce_dims(n_ctx*n_state, n_segments, embd.data(), n_features);
#elif 0
// use conv embedding
for (int il = 0; il < 1; ++il) {
std::vector<float> embd(n_segments*n_ctx*n_state);
for (int i = 0; i < n_segments; ++i) {
const auto & segment_i = ctx->result_all[i];
printf("%s: layer %2d, segment %3d: t0 = %7d, t1 = %7d, text = %s\n", __func__, il, i, (int) segment_i.t0, (int) segment_i.t1, segment_i.text.c_str());
ctx->mel.n_len = segment_i.t1;
whisper_encode(*ctx, segment_i.t0, 7, true);
const size_t offs = ggml_element_size(ctx->kv_enc_self.k)*(il*n_ctx*n_state);
const ggml_fp16_t * f = (const ggml_fp16_t * )((const char *) ctx->kv_enc_self.k->data + offs);
for (int j = 0; j < n_ctx*n_state; ++j) {
embd[i*n_ctx*n_state + j] = ggml_fp16_to_fp32(f[j]);
}
}
const int n_features = std::min(3, n_segments);
ggml_svd_reduce_dims(n_ctx*n_state, n_segments, embd.data(), n_features);
#else
// use enc self kv cache of various layers
for (int il = 0; il < n_layer; ++il) {
std::vector<float> embd(n_segments*n_ctx*n_state);
for (int i = 0; i < n_segments; ++i) {
const auto & segment_i = ctx->result_all[i];
printf("%s: layer %2d, segment %3d: t0 = %7d, t1 = %7d, text = %s\n", __func__, il, i, (int) segment_i.t0, (int) segment_i.t1, segment_i.text.c_str());
ctx->mel.n_len = segment_i.t1;
whisper_encode(*ctx, segment_i.t0, 7, true);
const size_t offs = ggml_element_size(ctx->kv_enc_self.k)*(il*n_ctx*n_state);
const ggml_fp16_t * f = (const ggml_fp16_t * )((const char *) ctx->kv_enc_self.k->data + offs);
for (int j = 0; j < n_ctx*n_state; ++j) {
embd[i*n_ctx*n_state + j] = ggml_fp16_to_fp32(f[j]);
}
}
const int n_features = std::min(4, n_segments);
ggml_svd_reduce_dims(n_ctx*n_state, n_segments, embd.data(), n_features);
#endif
std::vector<std::vector<double>> features(n_segments);
for (int i = 0; i < n_segments; ++i) {
features[i].resize(n_features);
for (int j = 0; j < n_features; ++j) {
features[i][j] = embd[i*n_features + j];
}
}
// fuzzy c-means clustering
const int n_clusters = 2;
std::vector<std::vector<double>> centroids(n_clusters, std::vector<double>(n_features, 0.0));
std::vector<std::vector<double>> membership(n_segments, std::vector<double>(n_clusters, 0.0));
// initialize the centroids
for (int i = 0; i < n_clusters; ++i) {
for (int j = 0; j < n_features; ++j) {
centroids[i][j] = features[i][j];
}
}
// initialize the membership
for (int i = 0; i < n_segments; ++i) {
//membership[i][i % n_clusters] = 1.0;
//for (int j = 0; j < n_clusters; ++j) {
// membership[i][j] = rand() / (float) RAND_MAX;
//}
for (int j = 0; j < n_clusters; ++j) {
membership[i][j] = 1.0 / n_clusters;
}
}
const int niter = 10000;
// iterate
for (int i = 0; i < niter; ++i) {
// print the membership
if (i == niter - 1) {
//{
for (int i = 0; i < n_segments; ++i) {
#if 1
printf("%s: membership %3d: ", __func__, i);
for (int j = 0; j < n_clusters; ++j) {
printf("%.1f ", membership[i][j]);
}
printf(" '%s'\n", ctx->result_all[i].text.c_str());
#else
printf("%s: features : ", __func__);
for (int j = 0; j < n_features; ++j) {
printf("%8.3f ", features[i][j]);
}
printf(" '%s'\n", ctx->result_all[i].text.c_str());
#endif
}
printf("----------------\n");
// print the centroids
for (int i = 0; i < n_clusters; ++i) {
printf("%s: centroid %d: ", __func__, i);
for (int j = 0; j < n_features; ++j) {
printf("%f ", centroids[i][j]);
}
printf("\n");
}
}
// update the membership
for (int j = 0; j < n_segments; ++j) {
for (int k = 0; k < n_clusters; ++k) {
double sum = 0.0;
for (int l = 0; l < n_clusters; ++l) {
//sum += std::pow(whisper_distance(features[j], centroids[k])/whisper_distance(features[j], centroids[l]), 2.0/(2.0 - 1.0));
double d0 = 0.0;
double d1 = 0.0;
#if 1
// use the euclidean distance
{
for (int m = 0; m < n_features; ++m) {
d0 += std::pow(features[j][m] - centroids[k][m], 2.0);
}
d0 = std::sqrt(d0);
for (int m = 0; m < n_features; ++m) {
d1 += std::pow(features[j][m] - centroids[l][m], 2.0);
}
d1 = std::sqrt(d1);
}
#else
// use the cosine distance
{
double dot = 0.0;
double norm0 = 0.0;
double norm1 = 0.0;
for (int m = 0; m < n_features; ++m) {
dot += features[j][m]*centroids[k][m];
norm0 += std::pow(features[j][m], 2.0);
norm1 += std::pow(centroids[k][m], 2.0);
}
d0 = 1.0 - dot/(std::sqrt(norm0)*std::sqrt(norm1));
dot = 0.0;
norm0 = 0.0;
norm1 = 0.0;
for (int m = 0; m < n_features; ++m) {
dot += features[j][m]*centroids[l][m];
norm0 += std::pow(features[j][m], 2.0);
norm1 += std::pow(centroids[l][m], 2.0);
}
d1 = 1.0 - dot/(std::sqrt(norm0)*std::sqrt(norm1));
}
#endif
if (d1 > 0.0) {
sum += std::pow(d0/d1, 2.0/(1.20 - 1.0));
} else {
sum += 1.0;
}
}
membership[j][k] = sum == 0.0 ? 1.0 : 1.0/sum;
}
}
// update the centroids
for (int j = 0; j < n_clusters; ++j) {
for (int k = 0; k < n_features; ++k) {
double sum = 0.0;
double sum2 = 0.0;
for (int l = 0; l < n_segments; ++l) {
sum += membership[l][j]*features[l][k];
sum2 += membership[l][j];
}
centroids[j][k] = sum2 == 0.0 ? 0.0 : sum/sum2;
}
}
}
}
// restore the mel length
ctx->mel.n_len = mel_len_save;
}
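
The clustering loop above is a plain fuzzy c-means over the SVD-reduced segment embeddings, with `n_clusters = 2` and the fuzzifier hard-coded via the `1.20` literal. In standard notation, with $x_j$ the feature vector of segment $j$, $c_k$ the centroids, $C$ the number of clusters, and $d_{jk} = \lVert x_j - c_k \rVert$, each iteration computes

$$
u_{jk} = \left[ \sum_{l=1}^{C} \left( \frac{d_{jk}}{d_{jl}} \right)^{\frac{2}{m-1}} \right]^{-1},
\qquad
c_k = \frac{\sum_{j} u_{jk}\, x_j}{\sum_{j} u_{jk}},
\qquad m = 1.2 .
$$

(Textbook FCM weights the centroid update by $u_{jk}^m$; this code uses $u_{jk}$ directly.)

End to end, the experimental API is used exactly as in the examples/main/main.cpp hunk above: transcribe first, then cluster. A minimal sketch, with the model path and audio-loading step as placeholders rather than part of this diff:

```cpp
// Hedged sketch of the experimental diarization flow on this branch.
#include "whisper.h"

#include <vector>

int main() {
    struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en.bin");
    if (!ctx) {
        return 1;
    }

    // load 16 kHz mono float PCM here (e.g. with the helpers in examples/common.cpp)
    std::vector<float> pcmf32;

    whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    if (whisper_full(ctx, params, pcmf32.data(), (int) pcmf32.size()) == 0) {
        // re-encodes each segment and prints its fuzzy cluster memberships
        // (2 clusters, hard-coded above)
        whisper_full_cluster_segments(ctx);
    }

    whisper_free(ctx);
    return 0;
}
```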

whisper.h

@@ -243,16 +243,6 @@ extern "C" {
     // If it returns false, the computation is aborted
     typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);

-    // Logits filter callback
-    // Can be used to modify the logits before sampling
-    // If not NULL, called after applying temperature to logits
-    typedef void (*whisper_logits_filter_callback)(
-            struct whisper_context * ctx,
-            const whisper_token_data * tokens,
-                             int   n_tokens,
-                           float * logits,
-                            void * user_data);
-
     // Parameters for the whisper_full() function
     // If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
     // whisper_full_default_params()
@@ -325,10 +315,6 @@ extern "C" {
         // called each time before the encoder starts
         whisper_encoder_begin_callback encoder_begin_callback;
         void * encoder_begin_callback_user_data;
-
-        // called by each decoder to filter obtained logits
-        whisper_logits_filter_callback logits_filter_callback;
-        void * logits_filter_callback_user_data;
     };

     WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);
@@ -386,6 +372,10 @@ extern "C" {
     WHISPER_API int whisper_bench_memcpy(int n_threads);
     WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);

+    // Temporary experimental API
+    WHISPER_API void whisper_full_cluster_segments(struct whisper_context * ctx);
+
 #ifdef __cplusplus
 }
 #endif