Compare commits

..

1 Commits

Author SHA1 Message Date
4f074fb7a8 tmp : demonstrate how to measure time of ggml ops 2023-03-09 09:28:06 +02:00
26 changed files with 102 additions and 978 deletions

2
.gitignore vendored
View File

@ -1,7 +1,5 @@
*.o
*.a
*.mlmodel
*.mlmodelc
.cache/
.vs/
.vscode/

View File

@ -54,8 +54,6 @@ if (APPLE)
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
else()
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF)
endif()
@ -88,33 +86,16 @@ endif()
find_package(Threads REQUIRED)
# on APPLE
if (APPLE)
# include Accelerate framework
if (NOT WHISPER_NO_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
# on APPLE - include Accelerate framework
if (APPLE AND NOT WHISPER_NO_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
else()
message(WARNING "Accelerate framework not found")
endif()
endif()
if (WHISPER_COREML)
find_library(FOUNDATION_FRAMEWORK Foundation)
find_library(COREML_FRAMEWORK CoreML)
if (COREML_FRAMEWORK)
message(STATUS "CoreML framework found")
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
else()
message(WARNING "CoreML framework not found")
endif()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE)
else()
message(WARNING "Accelerate framework not found")
endif()
endif()
@ -191,9 +172,7 @@ else()
if(NOT WHISPER_NO_FMA)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
if(NOT WHISPER_NO_F16C)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
endif()
endif()
@ -202,33 +181,6 @@ if (WHISPER_PERF)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
endif()
#
# whisper.coreml - Core ML support
#
if (WHISPER_COREML)
set(TARGET whisper.coreml)
add_library(${TARGET}
coreml/whisper-encoder.h
coreml/whisper-encoder.mm
coreml/whisper-encoder-impl.h
coreml/whisper-encoder-impl.m
)
include(DefaultTargetOptions)
target_include_directories(${TARGET} PUBLIC
.
)
target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
set_target_properties(${TARGET} PROPERTIES
COMPILE_FLAGS "-fobjc-arc"
)
endif()
#
# whisper - this is the main library of the project
#
@ -248,10 +200,6 @@ target_include_directories(${TARGET} PUBLIC
.
)
if (WHISPER_COREML)
target_link_libraries(${TARGET} PRIVATE whisper.coreml)
endif()
if (MSVC)
target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})

View File

@ -34,12 +34,6 @@ CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =
# ref: https://github.com/ggerganov/whisper.cpp/issues/37
ifneq ($(wildcard /usr/include/musl/*),)
CFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
endif
# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
@ -138,10 +132,6 @@ ifndef WHISPER_NO_ACCELERATE
LDFLAGS += -framework Accelerate
endif
endif
ifdef WHISPER_COREML
CXXFLAGS += -DWHISPER_USE_COREML
LDFLAGS += -framework Foundation -framework CoreML
endif
ifdef WHISPER_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas
@ -194,23 +184,11 @@ ggml.o: ggml.c ggml.h
whisper.o: whisper.cpp whisper.h
$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
ifndef WHISPER_COREML
WHISPER_OBJ = whisper.o
else
whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
libwhisper.a: ggml.o whisper.o
$(AR) rcs libwhisper.a ggml.o whisper.o
whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
endif
libwhisper.a: ggml.o $(WHISPER_OBJ)
$(AR) rcs libwhisper.a ggml.o $(WHISPER_OBJ)
libwhisper.so: ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
libwhisper.so: ggml.o whisper.o
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o whisper.o $(LDFLAGS)
clean:
rm -f *.o main stream command talk bench libwhisper.a libwhisper.so
@ -224,21 +202,21 @@ CC_SDL=`sdl2-config --cflags --libs`
SRC_COMMON = examples/common.cpp
SRC_COMMON_SDL = examples/common-sdl.cpp
main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o main $(LDFLAGS)
main: examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/main/main.cpp $(SRC_COMMON) ggml.o whisper.o -o main $(LDFLAGS)
./main -h
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o stream $(CC_SDL) $(LDFLAGS)
command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o command $(CC_SDL) $(LDFLAGS)
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o whisper.o -o talk $(CC_SDL) $(LDFLAGS)
bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
bench: examples/bench/bench.cpp ggml.o whisper.o
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o whisper.o -o bench $(LDFLAGS)
#
# Audio samples

View File

@ -466,7 +466,7 @@ The original models are converted to a custom binary format. This allows to pack
You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
or manually from here:
- https://huggingface.co/ggerganov/whisper.cpp
- https://huggingface.co/datasets/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README
@ -476,7 +476,6 @@ in [models](models).
- [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
- [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
- React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
@ -486,7 +485,6 @@ in [models](models).
- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
## Examples

View File

@ -17,9 +17,9 @@ import (
// CONSTANTS
const (
srcUrl = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
srcExt = ".bin" // Filename extension
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
srcUrl = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
srcExt = ".bin" // Filename extension
bufSize = 1024 * 64 // Size of the buffer used for downloading the model
)
var (

View File

@ -1,142 +0,0 @@
//
// CoremlEncoder.h
//
// This file was automatically generated and should not be edited.
//
#import <Foundation/Foundation.h>
#import <CoreML/CoreML.h>
#include <stdint.h>
#include <os/log.h>
NS_ASSUME_NONNULL_BEGIN
/// Model Prediction Input Type
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoderInput : NSObject<MLFeatureProvider>
/// melSegment as 1 × 80 × 3000 3-dimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * melSegment;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment NS_DESIGNATED_INITIALIZER;
@end
/// Model Prediction Output Type
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoderOutput : NSObject<MLFeatureProvider>
/// output as multidimensional array of floats
@property (readwrite, nonatomic, strong) MLMultiArray * output;
- (instancetype)init NS_UNAVAILABLE;
- (instancetype)initWithOutput:(MLMultiArray *)output NS_DESIGNATED_INITIALIZER;
@end
/// Class for model loading and prediction
API_AVAILABLE(macos(10.15), ios(13.0), watchos(6.0), tvos(13.0)) __attribute__((visibility("hidden")))
@interface CoremlEncoder : NSObject
@property (readonly, nonatomic, nullable) MLModel * model;
/**
URL of the underlying .mlmodelc directory.
*/
+ (nullable NSURL *)URLOfModelInThisBundle;
/**
Initialize CoremlEncoder instance from an existing MLModel object.
Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
*/
- (instancetype)initWithMLModel:(MLModel *)model NS_DESIGNATED_INITIALIZER;
/**
Initialize CoremlEncoder instance with the model in this bundle.
*/
- (nullable instancetype)init;
/**
Initialize CoremlEncoder instance with the model in this bundle.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Construct CoremlEncoder instance asynchronously with configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
/**
Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param modelURL The model URL.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler API_AVAILABLE(macos(11.0), ios(14.0), watchos(7.0), tvos(14.0)) __attribute__((visibility("hidden")));
/**
Make a prediction using the standard interface
@param input an instance of CoremlEncoderInput to predict from
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Make a prediction using the standard interface
@param input an instance of CoremlEncoderInput to predict from
@param options prediction options
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Make a prediction using the convenience interface
@param melSegment as 1 × 80 × 3000 3-dimensional array of floats:
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the prediction as CoremlEncoderOutput
*/
- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error;
/**
Batch prediction
@param inputArray array of CoremlEncoderInput instances to obtain predictions from
@param options prediction options
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
@return the predictions as NSArray<CoremlEncoderOutput *>
*/
- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error;
@end
NS_ASSUME_NONNULL_END

View File

@ -1,197 +0,0 @@
//
// CoremlEncoder.m
//
// This file was automatically generated and should not be edited.
//
#if !__has_feature(objc_arc)
#error This file must be compiled with automatic reference counting enabled (-fobjc-arc)
#endif
#import "whisper-encoder-impl.h"
@implementation CoremlEncoderInput
- (instancetype)initWithMelSegment:(MLMultiArray *)melSegment {
self = [super init];
if (self) {
_melSegment = melSegment;
}
return self;
}
- (NSSet<NSString *> *)featureNames {
return [NSSet setWithArray:@[@"melSegment"]];
}
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
if ([featureName isEqualToString:@"melSegment"]) {
return [MLFeatureValue featureValueWithMultiArray:self.melSegment];
}
return nil;
}
@end
@implementation CoremlEncoderOutput
- (instancetype)initWithOutput:(MLMultiArray *)output {
self = [super init];
if (self) {
_output = output;
}
return self;
}
- (NSSet<NSString *> *)featureNames {
return [NSSet setWithArray:@[@"output"]];
}
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
if ([featureName isEqualToString:@"output"]) {
return [MLFeatureValue featureValueWithMultiArray:self.output];
}
return nil;
}
@end
@implementation CoremlEncoder
/**
URL of the underlying .mlmodelc directory.
*/
+ (nullable NSURL *)URLOfModelInThisBundle {
NSString *assetPath = [[NSBundle bundleForClass:[self class]] pathForResource:@"CoremlEncoder" ofType:@"mlmodelc"];
if (nil == assetPath) { os_log_error(OS_LOG_DEFAULT, "Could not load CoremlEncoder.mlmodelc in the bundle resource"); return nil; }
return [NSURL fileURLWithPath:assetPath];
}
/**
Initialize CoremlEncoder instance from an existing MLModel object.
Usually the application does not use this initializer unless it makes a subclass of CoremlEncoder.
Such application may want to use `-[MLModel initWithContentsOfURL:configuration:error:]` and `+URLOfModelInThisBundle` to create a MLModel object to pass-in.
*/
- (instancetype)initWithMLModel:(MLModel *)model {
self = [super init];
if (!self) { return nil; }
_model = model;
if (_model == nil) { return nil; }
return self;
}
/**
Initialize CoremlEncoder instance with the model in this bundle.
*/
- (nullable instancetype)init {
return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle error:nil];
}
/**
Initialize CoremlEncoder instance with the model in this bundle.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithConfiguration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
return [self initWithContentsOfURL:(NSURL * _Nonnull)self.class.URLOfModelInThisBundle configuration:configuration error:error];
}
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL error:(NSError * _Nullable __autoreleasing * _Nullable)error {
MLModel *model = [MLModel modelWithContentsOfURL:modelURL error:error];
if (model == nil) { return nil; }
return [self initWithMLModel:model];
}
/**
Initialize CoremlEncoder instance from the model URL.
@param modelURL URL to the .mlmodelc directory for CoremlEncoder.
@param configuration The model configuration object
@param error If an error occurs, upon return contains an NSError object that describes the problem. If you are not interested in possible errors, pass in NULL.
*/
- (nullable instancetype)initWithContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration error:(NSError * _Nullable __autoreleasing * _Nullable)error {
MLModel *model = [MLModel modelWithContentsOfURL:modelURL configuration:configuration error:error];
if (model == nil) { return nil; }
return [self initWithMLModel:model];
}
/**
Construct CoremlEncoder instance asynchronously with configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadWithConfiguration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
[self loadContentsOfURL:(NSURL * _Nonnull)[self URLOfModelInThisBundle]
configuration:configuration
completionHandler:handler];
}
/**
Construct CoremlEncoder instance asynchronously with URL of .mlmodelc directory and optional configuration.
Model loading may take time when the model content is not immediately available (e.g. encrypted model). Use this factory method especially when the caller is on the main thread.
@param modelURL The model URL.
@param configuration The model configuration
@param handler When the model load completes successfully or unsuccessfully, the completion handler is invoked with a valid CoremlEncoder instance or NSError object.
*/
+ (void)loadContentsOfURL:(NSURL *)modelURL configuration:(MLModelConfiguration *)configuration completionHandler:(void (^)(CoremlEncoder * _Nullable model, NSError * _Nullable error))handler {
[MLModel loadContentsOfURL:modelURL
configuration:configuration
completionHandler:^(MLModel *model, NSError *error) {
if (model != nil) {
CoremlEncoder *typedModel = [[CoremlEncoder alloc] initWithMLModel:model];
handler(typedModel, nil);
} else {
handler(nil, error);
}
}];
}
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input error:(NSError * _Nullable __autoreleasing * _Nullable)error {
return [self predictionFromFeatures:input options:[[MLPredictionOptions alloc] init] error:error];
}
- (nullable CoremlEncoderOutput *)predictionFromFeatures:(CoremlEncoderInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
if (!outFeatures) { return nil; }
return [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[outFeatures featureValueForName:@"output"].multiArrayValue];
}
- (nullable CoremlEncoderOutput *)predictionFromMelSegment:(MLMultiArray *)melSegment error:(NSError * _Nullable __autoreleasing * _Nullable)error {
CoremlEncoderInput *input_ = [[CoremlEncoderInput alloc] initWithMelSegment:melSegment];
return [self predictionFromFeatures:input_ error:error];
}
- (nullable NSArray<CoremlEncoderOutput *> *)predictionsFromInputs:(NSArray<CoremlEncoderInput*> *)inputArray options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
id<MLBatchProvider> inBatch = [[MLArrayBatchProvider alloc] initWithFeatureProviderArray:inputArray];
id<MLBatchProvider> outBatch = [self.model predictionsFromBatch:inBatch options:options error:error];
if (!outBatch) { return nil; }
NSMutableArray<CoremlEncoderOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
for (NSInteger i = 0; i < outBatch.count; i++) {
id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
CoremlEncoderOutput * result = [[CoremlEncoderOutput alloc] initWithOutput:(MLMultiArray *)[resultProvider featureValueForName:@"output"].multiArrayValue];
[results addObject:result];
}
return results;
}
@end

View File

@ -1,22 +0,0 @@
// Wrapper of the Core ML Whisper Encoder model
//
// Code is derived from the work of Github user @wangchou
// ref: https://github.com/wangchou/callCoreMLFromCpp
#if __cplusplus
extern "C" {
#endif
struct whisper_coreml_context;
struct whisper_coreml_context * whisper_coreml_init(const char * path_model);
void whisper_coreml_free(struct whisper_coreml_context * ctx);
void whisper_coreml_encode(
const whisper_coreml_context * ctx,
float * mel,
float * out);
#if __cplusplus
}
#endif

View File

@ -1,61 +0,0 @@
#import "coreml/whisper-encoder.h"
#import "coreml/whisper-encoder-impl.h"
#import <CoreML/CoreML.h>
#include <stdlib.h>
#if __cplusplus
extern "C" {
#endif
struct whisper_coreml_context {
const void * data;
};
struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {
NSString * path_model_str = [[NSString alloc] initWithUTF8String:path_model];
NSURL * url_model = [NSURL fileURLWithPath: path_model_str];
const void * data = CFBridgingRetain([[CoremlEncoder alloc] initWithContentsOfURL:url_model error:nil]);
if (data == NULL) {
return NULL;
}
whisper_coreml_context * ctx = new whisper_coreml_context;
ctx->data = data;
return ctx;
}
void whisper_coreml_free(struct whisper_coreml_context * ctx) {
CFRelease(ctx->data);
delete ctx;
}
void whisper_coreml_encode(
const whisper_coreml_context * ctx,
float * mel,
float * out) {
MLMultiArray * inMultiArray = [
[MLMultiArray alloc] initWithDataPointer: mel
shape: @[@1, @80, @3000]
dataType: MLMultiArrayDataTypeFloat32
strides: @[@(240000), @(3000), @1]
deallocator: nil
error: nil
];
CoremlEncoderOutput * outCoreML = [(__bridge id) ctx->data predictionFromMelSegment:inMultiArray error:nil];
MLMultiArray * outMA = outCoreML.output;
memcpy(out, outMA.dataPointer, outMA.count * sizeof(float));
}
#if __cplusplus
}
#endif

View File

@ -292,64 +292,51 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
return 0;
}
class Worker : public Napi::AsyncWorker {
public:
Worker(Napi::Function& callback, whisper_params params)
: Napi::AsyncWorker(callback), params(params) {}
void Execute() override {
run(params, result);
}
void OnOK() override {
Napi::HandleScope scope(Env());
Napi::Object res = Napi::Array::New(Env(), result.size());
for (uint64_t i = 0; i < result.size(); ++i) {
Napi::Object tmp = Napi::Array::New(Env(), 3);
for (uint64_t j = 0; j < 3; ++j) {
tmp[j] = Napi::String::New(Env(), result[i][j]);
}
res[i] = tmp;
Napi::Object whisper(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (info.Length() <= 0 || !info[0].IsObject()) {
Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
}
Callback().Call({Env().Null(), res});
}
whisper_params params;
std::vector<std::vector<std::string>> result;
private:
whisper_params params;
std::vector<std::vector<std::string>> result;
};
Napi::Object whisper_params = info[0].As<Napi::Object>();
std::string language = whisper_params.Get("language").As<Napi::String>();
std::string model = whisper_params.Get("model").As<Napi::String>();
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
params.language = language;
params.model = model;
params.fname_inp.emplace_back(input);
// run model
run(params, result);
Napi::Value whisper(const Napi::CallbackInfo& info) {
Napi::Env env = info.Env();
if (info.Length() <= 0 || !info[0].IsObject()) {
Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
}
whisper_params params;
fprintf(stderr, "RESULT:\n");
for (auto sentence:result) {
fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
}
Napi::Object whisper_params = info[0].As<Napi::Object>();
std::string language = whisper_params.Get("language").As<Napi::String>();
std::string model = whisper_params.Get("model").As<Napi::String>();
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
Napi::Object res = Napi::Array::New(env, result.size());
for (uint64_t i = 0; i < result.size(); ++i) {
Napi::Object tmp = Napi::Array::New(env, 3);
for (uint64_t j = 0; j < 3; ++j) {
tmp[j] = Napi::String::New(env, result[i][j]);
}
res[i] = tmp;
}
params.language = language;
params.model = model;
params.fname_inp.emplace_back(input);
Napi::Function callback = info[1].As<Napi::Function>();
Worker* worker = new Worker(callback, params);
worker->Queue();
return env.Undefined();
return res;
}
Napi::Object Init(Napi::Env env, Napi::Object exports) {
exports.Set(
Napi::String::New(env, "whisper"),
Napi::Function::New(env, whisper)
);
return exports;
exports.Set(
Napi::String::New(env, "whisper"),
Napi::Function::New(env, whisper)
);
return exports;
}
NODE_API_MODULE(whisper, Init);

View File

@ -1,36 +1,27 @@
const path = require("path");
const { whisper } = require(path.join(
__dirname,
"../../build/Release/whisper-addon"
));
const { promisify } = require("util");
const whisperAsync = promisify(whisper);
const path = require('path');
const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));
const whisperParams = {
language: "en",
model: path.join(__dirname, "../../models/ggml-base.en.bin"),
fname_inp: "../../samples/jfk.wav",
language: 'en',
model: path.join(__dirname, '../../models/ggml-base.en.bin'),
fname_inp: '',
};
const arguments = process.argv.slice(2);
const params = Object.fromEntries(
arguments.reduce((pre, item) => {
if (item.startsWith("--")) {
return [...pre, item.slice(2).split("=")];
}
return pre;
}, [])
arguments.reduce((pre, item) => {
if (item.startsWith("--")) {
return [...pre, item.slice(2).split("=")];
}
return pre;
}, []),
);
for (const key in params) {
if (whisperParams.hasOwnProperty(key)) {
whisperParams[key] = params[key];
}
if (whisperParams.hasOwnProperty(key)) {
whisperParams[key] = params[key];
}
}
console.log("whisperParams =", whisperParams);
whisperAsync(whisperParams).then((result) => {
console.log(`Result from whisper: ${result}`);
});
console.log('whisperParams =', whisperParams);
console.log(whisper(whisperParams));

View File

@ -31,7 +31,6 @@ options:
-osrt, --output-srt [false ] output result in a srt file
-owts, --output-words [false ] output script for generating karaoke video
-ocsv, --output-csv [false ] output result in a CSV file
-oj, --output-json [false ] output result in a JSON file
-of FNAME, --output-file FNAME [ ] output file path (without file extension)
-ps, --print-special [false ] print special tokens
-pc, --print-colors [false ] print colors

View File

@ -73,7 +73,6 @@ struct whisper_params {
bool output_srt = false;
bool output_wts = false;
bool output_csv = false;
bool output_jsn = false;
bool print_special = false;
bool print_colors = false;
bool print_progress = false;
@ -131,7 +130,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
@ -180,7 +178,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
@ -371,129 +368,6 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
return true;
}
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
std::ofstream fout(fname);
int indent = 0;
auto doindent = [&]() {
for (int i = 0; i < indent; i++) fout << "\t";
};
auto start_arr = [&](const char *name) {
doindent();
fout << "\"" << name << "\": [\n";
indent++;
};
auto end_arr = [&](bool end = false) {
indent--;
doindent();
fout << (end ? "]\n" : "},\n");
};
auto start_obj = [&](const char *name = nullptr) {
doindent();
if (name) {
fout << "\"" << name << "\": {\n";
} else {
fout << "{\n";
}
indent++;
};
auto end_obj = [&](bool end = false) {
indent--;
doindent();
fout << (end ? "}\n" : "},\n");
};
auto start_value = [&](const char *name) {
doindent();
fout << "\"" << name << "\": ";
};
auto value_s = [&](const char *name, const char *val, bool end = false) {
start_value(name);
fout << "\"" << val << (end ? "\"\n" : "\",\n");
};
auto end_value = [&](bool end = false) {
fout << (end ? "\n" : ",\n");
};
auto value_i = [&](const char *name, const int64_t val, bool end = false) {
start_value(name);
fout << val;
end_value(end);
};
auto value_b = [&](const char *name, const bool val, bool end = false) {
start_value(name);
fout << (val ? "true" : "false");
end_value(end);
};
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
return false;
}
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
start_obj();
value_s("systeminfo", whisper_print_system_info());
start_obj("model");
value_s("type", whisper_model_type_readable(ctx));
value_b("multilingual", whisper_is_multilingual(ctx));
value_i("vocab", whisper_model_n_vocab(ctx));
start_obj("audio");
value_i("ctx", whisper_model_n_audio_ctx(ctx));
value_i("state", whisper_model_n_audio_state(ctx));
value_i("head", whisper_model_n_audio_head(ctx));
value_i("layer", whisper_model_n_audio_layer(ctx), true);
end_obj();
start_obj("text");
value_i("ctx", whisper_model_n_text_ctx(ctx));
value_i("state", whisper_model_n_text_state(ctx));
value_i("head", whisper_model_n_text_head(ctx));
value_i("leyer", whisper_model_n_text_layer(ctx), true);
end_obj();
value_i("mels", whisper_model_n_mels(ctx));
value_i("f16", whisper_model_f16(ctx), true);
end_obj();
start_obj("params");
value_s("model", params.model.c_str());
value_s("language", params.language.c_str());
value_b("translate", params.translate, true);
end_obj();
start_obj("result");
value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
end_obj();
start_arr("transcription");
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
start_obj();
start_obj("timestanps");
value_s("from", to_timestamp(t0, true).c_str());
value_s("to", to_timestamp(t1, true).c_str(), true);
end_obj();
start_obj("offsets");
value_i("from", t0 * 10);
value_i("to", t1 * 10, true);
end_obj();
value_s("text", text, true);
end_obj(i == (n_segments - 1));
}
end_arr(true);
end_obj(true);
return true;
}
// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments
@ -788,12 +662,6 @@ int main(int argc, char ** argv) {
const auto fname_csv = fname_out + ".csv";
output_csv(ctx, fname_csv.c_str());
}
// output to JSON file
if (params.output_jsn) {
const auto fname_jsn = fname_out + ".json";
output_json(ctx, fname_jsn.c_str(), params);
}
}
}

View File

@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:
```
wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
```
## TTS

View File

@ -24,5 +24,3 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
This can significantly improve the performance of the transcription:
<img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.

View File

@ -296,10 +296,6 @@
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES;
OTHER_CFLAGS = (
"-O3",
"-DNDEBUG",
);
SDKROOT = iphoneos;
VALIDATE_PRODUCT = YES;
};

View File

@ -7,9 +7,8 @@ To use:
2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
5. Select the "release" build configuration under "Run", then deploy and run to your device.
[^1]: I recommend the tiny, base or small models for running on an iOS device.
[^2]: The `Release` build can boost performance of transcription. In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.
![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)

View File

@ -430,10 +430,6 @@
LLVM_LTO = YES;
MACOSX_DEPLOYMENT_TARGET = 13.0;
MARKETING_VERSION = 1.0;
OTHER_CFLAGS = (
"-O3",
"-DNDEBUG",
);
PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;

2
ggml.c
View File

@ -79,7 +79,7 @@ typedef void* thread_ret_t;
#define static_assert(cond, msg) _Static_assert(cond, msg)
#endif
/*#define GGML_PERF*/
#define GGML_PERF
#define GGML_DEBUG 0
#define GGML_GELU_FP16

View File

@ -6,7 +6,7 @@ using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either
the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
script to download the already converted models. Currently, they are hosted on the following locations:
- https://huggingface.co/ggerganov/whisper.cpp
- https://huggingface.co/datasets/ggerganov/whisper.cpp
- https://ggml.ggerganov.com
Sample usage:
@ -23,7 +23,7 @@ You can now use it like this:
A third option to obtain the model files is to download them from Hugging Face:
https://huggingface.co/ggerganov/whisper.cpp/tree/main
https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main
## Available models

View File

@ -79,11 +79,11 @@ dir_model = sys.argv[1]
dir_whisper = sys.argv[2]
dir_out = sys.argv[3]
with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
with open(dir_model + "/vocab.json", "r") as f:
encoder = json.load(f)
with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
with open(dir_model + "/added_tokens.json", "r") as f:
encoder_added = json.load(f)
with open(dir_model + "/config.json", "r", encoding="utf8") as f:
with open(dir_model + "/config.json", "r") as f:
hparams = json.load(f)
model = WhisperForConditionalGeneration.from_pretrained(dir_model)

View File

@ -1,82 +0,0 @@
#!/bin/bash
# This script downloads Whisper model files that have already been converted to Core ML format.
# This way you don't have to convert them yourself.
src="https://huggingface.co/datasets/ggerganov/whisper.cpp-coreml"
pfx="resolve/main/ggml"
# get the path of this script
function get_script_path() {
if [ -x "$(command -v realpath)" ]; then
echo "$(dirname $(realpath $0))"
else
local ret="$(cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P)"
echo "$ret"
fi
}
models_path="$(get_script_path)"
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
# list available models
function list_models {
printf "\n"
printf " Available models:"
for model in "${models[@]}"; do
printf " $model"
done
printf "\n\n"
}
if [ "$#" -ne 1 ]; then
printf "Usage: $0 <model>\n"
list_models
exit 1
fi
model=$1
if [[ ! " ${models[@]} " =~ " ${model} " ]]; then
printf "Invalid model: $model\n"
list_models
exit 1
fi
# download Core ML model
printf "Downloading Core ML model $model from '$src' ...\n"
cd $models_path
if [ -f "ggml-$model.mlmodel" ]; then
printf "Model $model already exists. Skipping download.\n"
exit 0
fi
if [ -x "$(command -v wget)" ]; then
wget --quiet --show-progress -O ggml-$model.mlmodel $src/$pfx-$model.mlmodel
elif [ -x "$(command -v curl)" ]; then
curl -L --output ggml-$model.mlmodel $src/$pfx-$model.mlmodel
else
printf "Either wget or curl is required to download models.\n"
exit 1
fi
if [ $? -ne 0 ]; then
printf "Failed to download Core ML model $model \n"
printf "Please try again later or download the original Whisper model files and convert them yourself.\n"
exit 1
fi
printf "Done! Model '$model' saved in 'models/ggml-$model.mlmodel'\n"
printf "Run the following command to compile it:\n\n"
printf " $ xcrun coremlc compile ./models/ggml-$model.mlmodel ./models\n\n"
printf "You can now use it like this:\n\n"
printf " $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
printf "\n"

View File

@ -40,7 +40,7 @@ if exist "ggml-%model%.bin" (
goto :eof
)
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
if %ERRORLEVEL% neq 0 (
echo Failed to download ggml model %model%

View File

@ -6,7 +6,7 @@
#src="https://ggml.ggerganov.com"
#pfx="ggml-model-whisper"
src="https://huggingface.co/ggerganov/whisper.cpp"
src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
pfx="resolve/main/ggml"
# get the path of this script

View File

@ -1,8 +1,5 @@
#define WHISPER_BUILD
#include "whisper.h"
#if WHISPER_USE_COREML
#include "coreml/whisper-encoder.h"
#endif
#include "ggml.h"
@ -589,10 +586,6 @@ struct whisper_state {
int lang_id = 0; // english by default
#ifdef WHISPER_USE_COREML
whisper_coreml_context * ctx_coreml;
#endif
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg = 0;
int64_t t_last = 0;
@ -638,13 +631,12 @@ struct whisper_context {
int64_t t_load_us = 0;
int64_t t_start_us = 0;
ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
whisper_model model;
whisper_vocab vocab;
whisper_state * state = nullptr;
std::string path_model; // populated by whisper_init_from_file()
};
template<typename T>
@ -1375,7 +1367,6 @@ static bool whisper_encode_internal(
}
}
#ifndef WHISPER_USE_COREML
struct ggml_tensor * cur;
// convolution + gelu
@ -1418,7 +1409,7 @@ static bool whisper_encode_internal(
//}
static int iter = 0;
const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
@ -1607,7 +1598,7 @@ static bool whisper_encode_internal(
ggml_repeat(ctx0, layer.mlp_ln_w, cur),
cur),
ggml_repeat(ctx0, layer.mlp_ln_b, cur));
}
}
#ifdef WHISPER_USE_FLASH_FF
wstate.use_buf(ctx0, 0);
@ -1647,7 +1638,7 @@ static bool whisper_encode_internal(
ggml_repeat(ctx0, layer.mlp_1_b, cur),
cur);
#endif
}
}
wstate.use_buf(ctx0, 3);
@ -1684,13 +1675,6 @@ static bool whisper_encode_internal(
//ggml_graph_print(&gf);
}
#else
wstate.use_buf(ctx0, -1);
struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
#endif
// cur
//{
@ -2176,6 +2160,12 @@ static bool whisper_decode_internal(
ggml_graph_compute (ctx0, &gf);
}
// print the time for computing the last ggml_mul_mat that computes logits
// also print the total decoder time
// these need to be called after ggml_graph_compute()
printf("logits t = %7.3f ms (%2d runs, N = %3d, ggml_mul_mat: [%d x %d] * [%d x %d])\n", 1e-3*double(logits->perf_time_us)/logits->perf_runs, logits->perf_runs, N, logits->ne[0], logits->ne[1], cur->ne[1], cur->ne[0]);
printf("total t = %7.3f ms (%2d runs)\n", 1e-3*double(gf.perf_time_us)/gf.perf_runs, gf.perf_runs);
// extract logits for all N tokens
//logits_out.resize(N*n_vocab);
//memcpy(logits_out.data(), ggml_get_data(logits), sizeof(float)*N*n_vocab);
@ -2494,25 +2484,12 @@ static std::vector<whisper_vocab::id> tokenize(const whisper_vocab & vocab, cons
// interface implementation
//
#ifdef WHISPER_USE_COREML
// replace .bin with .mlmodelc
static std::string whisper_get_coreml_path(std::string path_bin) {
auto pos = path_bin.rfind('.');
if (pos != std::string::npos) {
path_bin = path_bin.substr(0, pos);
}
path_bin += ".mlmodelc";
return path_bin;
}
#endif
struct whisper_state * whisper_init_state(whisper_context * ctx) {
whisper_state * state = new whisper_state;
const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
return nullptr;
@ -2533,20 +2510,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
#ifdef WHISPER_USE_COREML
const auto path_coreml = whisper_get_coreml_path(ctx->path_model);
fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
if (!state->ctx_coreml) {
fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
return nullptr;
}
fprintf(stderr, "%s: Core ML model loaded\n", __func__);
#endif
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
@ -2582,7 +2545,6 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
}
loader.context = &fin;
loader.read = [](void * ctx, void * output, size_t read_size) {
std::ifstream * fin = (std::ifstream*)ctx;
fin->read((char *)output, read_size);
@ -2599,13 +2561,7 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
fin->close();
};
auto ctx = whisper_init_no_state(&loader);
if (ctx) {
ctx->path_model = path_model;
}
return ctx;
return whisper_init_no_state(&loader);
}
struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size) {
@ -2730,10 +2686,6 @@ void whisper_free(struct whisper_context * ctx) {
whisper_free_state(ctx->state);
#ifdef WHISPER_USE_COREML
whisper_coreml_free(ctx->state->ctx_coreml);
ctx->state->ctx_coreml = nullptr;
#endif
delete ctx;
}
}
@ -2910,7 +2862,7 @@ int whisper_lang_auto_detect_with_state(
}
// run the encoder
if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
if (whisper_encode(ctx, seek, n_threads) != 0) {
fprintf(stderr, "%s: failed to encode\n", __func__);
return -6;
}
@ -2974,71 +2926,6 @@ int whisper_lang_auto_detect(
return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
}
int whisper_model_n_vocab(struct whisper_context * ctx) {
return ctx->model.hparams.n_vocab;
}
int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
return ctx->model.hparams.n_audio_ctx;
}
int whisper_model_n_audio_state(struct whisper_context * ctx) {
return ctx->model.hparams.n_audio_state;
}
int whisper_model_n_audio_head(struct whisper_context * ctx) {
return ctx->model.hparams.n_audio_head;
}
int whisper_model_n_audio_layer(struct whisper_context * ctx) {
return ctx->model.hparams.n_audio_layer;
}
int whisper_model_n_text_ctx(struct whisper_context * ctx) {
return ctx->model.hparams.n_text_ctx;
}
int whisper_model_n_text_state(struct whisper_context * ctx) {
return ctx->model.hparams.n_text_state;
}
int whisper_model_n_text_head(struct whisper_context * ctx) {
return ctx->model.hparams.n_text_head;
}
int whisper_model_n_text_layer(struct whisper_context * ctx) {
return ctx->model.hparams.n_text_layer;
}
int whisper_model_n_mels(struct whisper_context * ctx) {
return ctx->model.hparams.n_mels;
}
int whisper_model_f16(struct whisper_context * ctx) {
return ctx->model.hparams.f16;
}
int whisper_model_type(struct whisper_context * ctx) {
return ctx->model.type;
}
const char *whisper_model_type_readable(struct whisper_context * ctx) {
switch (ctx->model.type) {
case e_model::MODEL_TINY:
return "tiny";
case e_model::MODEL_BASE:
return "base";
case e_model::MODEL_SMALL:
return "small";
case e_model::MODEL_MEDIUM:
return "medium";
case e_model::MODEL_LARGE:
return "large";
default:
return "unknown";
}
}
int whisper_n_len_from_state(struct whisper_state * state) {
return state->mel.n_len;
}
@ -3201,7 +3088,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.max_initial_ts =*/ 1.0f,
/*.length_penalty =*/ -1.0f,
/*.temperature_inc =*/ 0.0f, // TODO: temporary disabled until improve performance
/*.temperature_inc =*/ 0.2f,
/*.entropy_thold =*/ 2.4f,
/*.logprob_thold =*/ -1.0f,
/*.no_speech_thold =*/ 0.6f,

View File

@ -248,19 +248,6 @@ extern "C" {
WHISPER_API int whisper_n_audio_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_vocab (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
// Token logits obtained from the last call to whisper_decode()
// The logits for the last token are stored in the last row
// Rows: n_tokens
@ -270,8 +257,6 @@ extern "C" {
// Token Id -> String. Uses the vocabulary in the provided context
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
// Special tokens
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);