Mirror of https://github.com/ggerganov/whisper.cpp.git
whisper.objc : add real-time processing (#97)

Similar to the "stream" app

Commit e266cb0723 (parent c207eed431)
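Before the raw diffs, here is a condensed sketch (not the verbatim committed code) of the real-time path this change wires into the whisper.objc example. All names used below (StateInp, vc, isRealtime, isTranscribing, onTranscribe:, buttonRealtime, single_segment) come from the diff itself; error handling and the timing code are omitted for brevity.

// 1) The AudioQueue callback re-enqueues the buffer and, when real-time mode
//    is enabled, asks the view controller (reached through the new `vc`
//    back-pointer in StateInp) to transcribe what has been captured so far:
if (stateInp->isRealtime) {
    dispatch_async(dispatch_get_main_queue(), ^{
        ViewController * vc = (__bridge ViewController *)(stateInp->vc);
        [vc onTranscribe:nil];
    });
}

// 2) onTranscribe: drops overlapping requests via the new isTranscribing flag,
//    runs whisper_full on a background queue (single_segment is enabled in
//    real-time mode) and posts the text back to the UI on the main thread:
- (IBAction)onTranscribe:(id)sender {
    if (stateInp.isTranscribing) return;
    stateInp.isTranscribing = true;

    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
        params.single_segment = self->stateInp.isRealtime;
        params.n_threads      = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);

        // convert the captured int16 samples to float and run the model
        for (int i = 0; i < self->stateInp.n_samples; i++) {
            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
        }
        whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples);

        // collect the generated segments into a single string
        NSString * result = @"";
        for (int i = 0; i < whisper_full_n_segments(self->stateInp.ctx); i++) {
            const char * text = whisper_full_get_segment_text(self->stateInp.ctx, i);
            result = [result stringByAppendingString:[NSString stringWithUTF8String:text]];
        }

        dispatch_async(dispatch_get_main_queue(), ^{
            self->_textviewResult.text    = result;
            self->stateInp.isTranscribing = false;
        });
    });
}

Per the comment in whisper.h, single_segment forces single-segment output, which is what keeps these repeated passes over the growing capture buffer usable as streaming output.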
whisper.objc UI storyboard:

@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
 <device id="retina6_0" orientation="portrait" appearance="light"/>
 <dependencies>
-<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
+<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
 <capability name="Safe area layout guides" minToolsVersion="9.0"/>
 <capability name="System colors in document resources" minToolsVersion="11.0"/>
 <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@@ -40,7 +40,7 @@
 <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
 <color key="backgroundColor" systemColor="systemBackgroundColor"/>
 <color key="textColor" systemColor="labelColor"/>
-<fontDescription key="fontDescription" type="system" pointSize="20"/>
+<fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
 <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
 </textView>
 <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@@ -56,6 +56,18 @@
 <action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
 </connections>
 </button>
+<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
+<rect key="frame" x="199" y="191" width="156" height="49"/>
+<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
+<color key="tintColor" systemColor="opaqueSeparatorColor"/>
+<state key="normal" title="Real-time">
+<color key="titleColor" systemColor="labelColor"/>
+</state>
+<connections>
+<action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
+</connections>
+</button>
 </subviews>
 <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
 <color key="backgroundColor" systemColor="systemBackgroundColor"/>
@@ -64,6 +76,7 @@
 </constraints>
 </view>
 <connections>
+<outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
 <outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
 <outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
 <outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
whisper.objc view controller header:

@@ -20,6 +20,8 @@ typedef struct
 {
 int ggwaveId;
 bool isCapturing;
+bool isTranscribing;
+bool isRealtime;
 UILabel * labelReceived;

 AudioQueueRef queue;
@@ -31,6 +33,8 @@ typedef struct
 float * audioBufferF32;

 struct whisper_context * ctx;
+
+void * vc;
 } StateInp;

 @interface ViewController : UIViewController
whisper.objc view controller implementation:

@@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData,
 @property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
 @property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
 @property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
+@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
 @property (weak, nonatomic) IBOutlet UITextView *textviewResult;

 @end
@@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
 stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
 stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
 }
+
+stateInp.isTranscribing = false;
+stateInp.isRealtime = false;
 }

 -(IBAction) stopCapturing {
@@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
 NSLog(@"Start capturing");

 stateInp.n_samples = 0;
+stateInp.vc = (__bridge void *)(self);

 OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
 AudioInputCallback,
@@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData,
 - (IBAction)onTranscribePrepare:(id)sender {
 _textviewResult.text = @"Processing - please wait ...";

-if (stateInp.isCapturing) {
-// stop capturing
-[self stopCapturing];
-
-return;
-}
+if (stateInp.isRealtime) {
+[self onRealtime:(id)sender];
+}
+
+if (stateInp.isCapturing) {
+[self stopCapturing];
+}
 }

+- (IBAction)onRealtime:(id)sender {
+stateInp.isRealtime = !stateInp.isRealtime;
+
+if (stateInp.isRealtime) {
+[_buttonRealtime setBackgroundColor:[UIColor greenColor]];
+} else {
+[_buttonRealtime setBackgroundColor:[UIColor grayColor]];
+}
+
+NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
+}
+
 - (IBAction)onTranscribe:(id)sender {
-NSLog(@"Processing %d samples", stateInp.n_samples);
-
-// process captured audio
-// convert I16 to F32
-for (int i = 0; i < stateInp.n_samples; i++) {
-stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
-}
-
-// run the model
-struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-params.print_realtime = true;
-params.print_progress = false;
-params.print_timestamps = true;
-params.print_special = false;
-params.translate = false;
-params.language = "en";
-params.n_threads = 4;
-params.offset_ms = 0;
-
-CFTimeInterval startTime = CACurrentMediaTime();
-
-if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
-NSLog(@"Failed to run the model");
-_textviewResult.text = @"Failed to run the model";
-
-return;
-}
-
-CFTimeInterval endTime = CACurrentMediaTime();
-
-// clear the text in the textview
-_textviewResult.text = @"";
-
-int n_segments = whisper_full_n_segments(stateInp.ctx);
-for (int i = 0; i < n_segments; i++) {
-const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
-
-// append the text to the textview
-_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
-}
-
-// internal model timing
-whisper_print_timings(stateInp.ctx);
-
-NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
-
-_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+if (stateInp.isTranscribing) {
+return;
+}
+
+NSLog(@"Processing %d samples", stateInp.n_samples);
+
+stateInp.isTranscribing = true;
+
+// dispatch the model to a background thread
+dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+// process captured audio
+// convert I16 to F32
+for (int i = 0; i < self->stateInp.n_samples; i++) {
+self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
+}
+
+// run the model
+struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+// get maximum number of threads on this device (max 8)
+const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
+
+params.print_realtime = true;
+params.print_progress = false;
+params.print_timestamps = true;
+params.print_special = false;
+params.translate = false;
+params.language = "en";
+params.n_threads = max_threads;
+params.offset_ms = 0;
+params.single_segment = self->stateInp.isRealtime;
+
+CFTimeInterval startTime = CACurrentMediaTime();
+
+whisper_reset_timings(self->stateInp.ctx);
+
+if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
+NSLog(@"Failed to run the model");
+self->_textviewResult.text = @"Failed to run the model";
+
+return;
+}
+
+whisper_print_timings(self->stateInp.ctx);
+
+CFTimeInterval endTime = CACurrentMediaTime();
+
+NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
+
+// result text
+NSString *result = @"";
+
+int n_segments = whisper_full_n_segments(self->stateInp.ctx);
+for (int i = 0; i < n_segments; i++) {
+const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
+
+// append the text to the result
+result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+}
+
+// append processing time
+result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+
+// dispatch the result to the main thread
+dispatch_async(dispatch_get_main_queue(), ^{
+self->_textviewResult.text = result;
+self->stateInp.isTranscribing = false;
+});
+});
 }

 //
-// Callback implmentation
+// Callback implementation
 //

 void AudioInputCallback(void * inUserData,
@@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData,

 if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
 NSLog(@"Too much audio data, ignoring");
+
+dispatch_async(dispatch_get_main_queue(), ^{
+ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+[vc stopCapturing];
+});
+
 return;
 }

@@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData,

 // put the buffer back in the queue
 AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
+
+if (stateInp->isRealtime) {
+// dipatch onTranscribe() to the main thread
+dispatch_async(dispatch_get_main_queue(), ^{
+ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+[vc onTranscribe:nil];
+});
+}
 }

 @end
whisper.cpp (32 lines changed):
@@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
 ctx->t_decode_us = 0;
 }

+const char * whisper_print_system_info(void) {
+static std::string s;
+
+s = "";
+s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+
+return s.c_str();
+}
+
 ////////////////////////////////////////////////////////////////////////////

 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@@ -2863,7 +2878,7 @@ int whisper_full_parallel(
 struct whisper_full_params params,
 const float * samples,
 int n_samples,
-const int n_processors) {
+int n_processors) {
 if (n_processors == 1) {
 return whisper_full(ctx, params, samples, n_samples);
 }
@@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token) {
 return ctx->result_all[i_segment].tokens[i_token].p;
 }

-const char * whisper_print_system_info(void) {
-static std::string s;
-
-s = "";
-s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
-
-return s.c_str();
-}
-
 // =================================================================================================

 //
whisper.h (70 lines changed): several of the hunks below only realign whitespace; the substantive changes are the whisper_print_system_info() declaration moving next to the timing helpers and the const being dropped from the n_processors parameter of whisper_full_parallel().
@@ -72,16 +72,16 @@ extern "C" {
 whisper_token id; // token id
 whisper_token tid; // forced timestamp token id

 float p; // probability of the token
 float pt; // probability of the timestamp token
 float ptsum; // sum of probabilities of all timestamp tokens

 // token-level timestamp data
 // do not use if you haven't computed token-level timestamps
 int64_t t0; // start time of the token
 int64_t t1; // end time of the token

 float vlen; // voice length of the token
 } whisper_token_data;

 // Allocates all memory needed for the model and loads the model from the given file.
@@ -96,9 +96,9 @@ extern "C" {
 // Returns 0 on success
 WHISPER_API int whisper_pcm_to_mel(
 struct whisper_context * ctx,
 const float * samples,
 int n_samples,
 int n_threads);

 // This can be used to set a custom log mel spectrogram inside the provided whisper context.
 // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
@@ -106,9 +106,9 @@ extern "C" {
 // Returns 0 on success
 WHISPER_API int whisper_set_mel(
 struct whisper_context * ctx,
 const float * data,
 int n_len,
 int n_mel);

 // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
 // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
@@ -116,8 +116,8 @@ extern "C" {
 // Returns 0 on success
 WHISPER_API int whisper_encode(
 struct whisper_context * ctx,
 int offset,
 int n_threads);

 // Run the Whisper decoder to obtain the logits and probabilities for the next token.
 // Make sure to call whisper_encode() first.
@@ -126,10 +126,10 @@ extern "C" {
 // Returns 0 on success
 WHISPER_API int whisper_decode(
 struct whisper_context * ctx,
 const whisper_token * tokens,
 int n_tokens,
 int n_past,
 int n_threads);

 // Token sampling methods.
 // These are provided for convenience and can be used after each call to whisper_decode().
@@ -169,6 +169,9 @@ extern "C" {
 WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
 WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

+// Print system information
+WHISPER_API const char * whisper_print_system_info(void);
+
 ////////////////////////////////////////////////////////////////////////////

 // Available sampling strategies
@@ -187,12 +190,12 @@ extern "C" {

 int n_threads;
 int n_max_text_ctx;
 int offset_ms; // start offset in ms
 int duration_ms; // audio duration to process in ms

 bool translate;
 bool no_context;
 bool single_segment; // force single segment output (useful for streaming)
 bool print_special;
 bool print_progress;
 bool print_realtime;
@@ -206,8 +209,8 @@ extern "C" {
 int max_tokens; // max tokens per segment (0 = no limit)

 // [EXPERIMENTAL] speed-up techniques
 bool speed_up; // speed-up the audio by 2x using Phase Vocoder
 int audio_ctx; // overwrite the audio context size (0 = use default)

 // tokens to provide the whisper model as initial prompt
 // these are prepended to any existing text context from a previous call
@@ -235,20 +238,20 @@ extern "C" {
 // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
 // Uses the specified decoding strategy to obtain the text.
 WHISPER_API int whisper_full(
 struct whisper_context * ctx,
 struct whisper_full_params params,
 const float * samples,
 int n_samples);

 // Split the input audio in chunks and process each chunk separately using whisper_full()
 // It seems this approach can offer some speedup in some cases.
 // However, the transcription accuracy can be worse at the beginning and end of each chunk.
 WHISPER_API int whisper_full_parallel(
 struct whisper_context * ctx,
 struct whisper_full_params params,
 const float * samples,
 int n_samples,
-const int n_processors);
+int n_processors);

 // Number of generated text segments.
 // A segment can be a few words, a sentence, or even a paragraph.
@@ -275,9 +278,6 @@ extern "C" {
 // Get the probability of the specified token in the specified segment.
 WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);

-// Print system information
-WHISPER_API const char * whisper_print_system_info(void);
-
 #ifdef __cplusplus
 }
 #endif