Mirror of https://github.com/ggerganov/whisper.cpp.git
whisper.objc : add real-time processing (#97)

Similar to the "stream" app

Commit e266cb0723 (parent c207eed431)
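Before the raw diffs, here is a condensed sketch (not the verbatim committed code) of the real-time path this change wires into the whisper.objc example. All names used below (StateInp, vc, isRealtime, isTranscribing, onTranscribe:, buttonRealtime, single_segment) come from the diff itself; error handling and the timing code are omitted for brevity.

// 1) The AudioQueue callback re-enqueues the buffer and, when real-time mode
//    is enabled, asks the view controller (reached through the new `vc`
//    back-pointer in StateInp) to transcribe what has been captured so far:
if (stateInp->isRealtime) {
    dispatch_async(dispatch_get_main_queue(), ^{
        ViewController * vc = (__bridge ViewController *)(stateInp->vc);
        [vc onTranscribe:nil];
    });
}

// 2) onTranscribe: drops overlapping requests via the new isTranscribing flag,
//    runs whisper_full on a background queue (single_segment is enabled in
//    real-time mode) and posts the text back to the UI on the main thread:
- (IBAction)onTranscribe:(id)sender {
    if (stateInp.isTranscribing) return;
    stateInp.isTranscribing = true;

    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
        params.single_segment = self->stateInp.isRealtime;
        params.n_threads      = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);

        // convert the captured int16 samples to float and run the model
        for (int i = 0; i < self->stateInp.n_samples; i++) {
            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
        }
        whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples);

        // collect the generated segments into a single string
        NSString * result = @"";
        for (int i = 0; i < whisper_full_n_segments(self->stateInp.ctx); i++) {
            const char * text = whisper_full_get_segment_text(self->stateInp.ctx, i);
            result = [result stringByAppendingString:[NSString stringWithUTF8String:text]];
        }

        dispatch_async(dispatch_get_main_queue(), ^{
            self->_textviewResult.text    = result;
            self->stateInp.isTranscribing = false;
        });
    });
}

Per the comment in whisper.h, single_segment forces single-segment output, which is what keeps these repeated passes over the growing capture buffer usable as streaming output.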
whisper.objc UI storyboard:

@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
 <device id="retina6_0" orientation="portrait" appearance="light"/>
 <dependencies>
-<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
+<plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
 <capability name="Safe area layout guides" minToolsVersion="9.0"/>
 <capability name="System colors in document resources" minToolsVersion="11.0"/>
 <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@@ -40,7 +40,7 @@
 <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
 <color key="backgroundColor" systemColor="systemBackgroundColor"/>
 <color key="textColor" systemColor="labelColor"/>
-<fontDescription key="fontDescription" type="system" pointSize="20"/>
+<fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
 <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
 </textView>
 <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
@@ -56,6 +56,18 @@
 <action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
 </connections>
 </button>
+<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
+<rect key="frame" x="199" y="191" width="156" height="49"/>
+<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+<color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
+<color key="tintColor" systemColor="opaqueSeparatorColor"/>
+<state key="normal" title="Real-time">
+<color key="titleColor" systemColor="labelColor"/>
+</state>
+<connections>
+<action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
+</connections>
+</button>
 </subviews>
 <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
 <color key="backgroundColor" systemColor="systemBackgroundColor"/>
@@ -64,6 +76,7 @@
 </constraints>
 </view>
 <connections>
+<outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
 <outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
 <outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
 <outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
whisper.objc view controller header:

@@ -20,6 +20,8 @@ typedef struct
 {
 int ggwaveId;
 bool isCapturing;
+bool isTranscribing;
+bool isRealtime;
 UILabel * labelReceived;

 AudioQueueRef queue;
@@ -31,6 +33,8 @@ typedef struct
 float * audioBufferF32;

 struct whisper_context * ctx;
+
+void * vc;
 } StateInp;

 @interface ViewController : UIViewController
whisper.objc view controller implementation:

@@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData,
 @property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
 @property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
 @property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
+@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
 @property (weak, nonatomic) IBOutlet UITextView *textviewResult;

 @end
@@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
 stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
 stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
 }
+
+stateInp.isTranscribing = false;
+stateInp.isRealtime = false;
 }

 -(IBAction) stopCapturing {
@@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
 NSLog(@"Start capturing");

 stateInp.n_samples = 0;
+stateInp.vc = (__bridge void *)(self);

 OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
 AudioInputCallback,
@@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData,
 - (IBAction)onTranscribePrepare:(id)sender {
 _textviewResult.text = @"Processing - please wait ...";

-if (stateInp.isCapturing) {
-// stop capturing
-[self stopCapturing];
-
-return;
-}
+if (stateInp.isRealtime) {
+[self onRealtime:(id)sender];
+}
+
+if (stateInp.isCapturing) {
+[self stopCapturing];
+}
 }

+- (IBAction)onRealtime:(id)sender {
+stateInp.isRealtime = !stateInp.isRealtime;
+
+if (stateInp.isRealtime) {
+[_buttonRealtime setBackgroundColor:[UIColor greenColor]];
+} else {
+[_buttonRealtime setBackgroundColor:[UIColor grayColor]];
+}
+
+NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
+}
+
 - (IBAction)onTranscribe:(id)sender {
-NSLog(@"Processing %d samples", stateInp.n_samples);
-
-// process captured audio
-// convert I16 to F32
-for (int i = 0; i < stateInp.n_samples; i++) {
-stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
-}
-
-// run the model
-struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
-
-params.print_realtime = true;
-params.print_progress = false;
-params.print_timestamps = true;
-params.print_special = false;
-params.translate = false;
-params.language = "en";
-params.n_threads = 4;
-params.offset_ms = 0;
-
-CFTimeInterval startTime = CACurrentMediaTime();
-
-if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
-NSLog(@"Failed to run the model");
-_textviewResult.text = @"Failed to run the model";
-
-return;
-}
-
-CFTimeInterval endTime = CACurrentMediaTime();
-
-// clear the text in the textview
-_textviewResult.text = @"";
-
-int n_segments = whisper_full_n_segments(stateInp.ctx);
-for (int i = 0; i < n_segments; i++) {
-const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
-
-// append the text to the textview
-_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
-}
-
-// internal model timing
-whisper_print_timings(stateInp.ctx);
-
-NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
-
-_textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+if (stateInp.isTranscribing) {
+return;
+}
+
+NSLog(@"Processing %d samples", stateInp.n_samples);
+
+stateInp.isTranscribing = true;
+
+// dispatch the model to a background thread
+dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+// process captured audio
+// convert I16 to F32
+for (int i = 0; i < self->stateInp.n_samples; i++) {
+self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
+}
+
+// run the model
+struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+// get maximum number of threads on this device (max 8)
+const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
+
+params.print_realtime = true;
+params.print_progress = false;
+params.print_timestamps = true;
+params.print_special = false;
+params.translate = false;
+params.language = "en";
+params.n_threads = max_threads;
+params.offset_ms = 0;
+params.single_segment = self->stateInp.isRealtime;
+
+CFTimeInterval startTime = CACurrentMediaTime();
+
+whisper_reset_timings(self->stateInp.ctx);
+
+if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
+NSLog(@"Failed to run the model");
+self->_textviewResult.text = @"Failed to run the model";
+
+return;
+}
+
+whisper_print_timings(self->stateInp.ctx);
+
+CFTimeInterval endTime = CACurrentMediaTime();
+
+NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
+
+// result text
+NSString *result = @"";
+
+int n_segments = whisper_full_n_segments(self->stateInp.ctx);
+for (int i = 0; i < n_segments; i++) {
+const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
+
+// append the text to the result
+result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+}
+
+// append processing time
+result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+
+// dispatch the result to the main thread
+dispatch_async(dispatch_get_main_queue(), ^{
+self->_textviewResult.text = result;
+self->stateInp.isTranscribing = false;
+});
+});
 }

 //
-// Callback implmentation
+// Callback implementation
 //

 void AudioInputCallback(void * inUserData,
@@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData,

 if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
 NSLog(@"Too much audio data, ignoring");
+
+dispatch_async(dispatch_get_main_queue(), ^{
+ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+[vc stopCapturing];
+});
+
 return;
 }

@@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData,

 // put the buffer back in the queue
 AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
+
+if (stateInp->isRealtime) {
+// dipatch onTranscribe() to the main thread
+dispatch_async(dispatch_get_main_queue(), ^{
+ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+[vc onTranscribe:nil];
+});
+}
 }

 @end
whisper.cpp (32 lines changed):
@@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
 ctx->t_decode_us = 0;
 }

+const char * whisper_print_system_info(void) {
+static std::string s;
+
+s = "";
+s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+
+return s.c_str();
+}
+
 ////////////////////////////////////////////////////////////////////////////

 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@@ -2863,7 +2878,7 @@ int whisper_full_parallel(
 struct whisper_full_params params,
 const float * samples,
 int n_samples,
-const int n_processors) {
+int n_processors) {
 if (n_processors == 1) {
 return whisper_full(ctx, params, samples, n_samples);
 }
@@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token) {
 return ctx->result_all[i_segment].tokens[i_token].p;
 }

-const char * whisper_print_system_info(void) {
-static std::string s;
-
-s = "";
-s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
-s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
-s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
-s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
-s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
-s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
-
-return s.c_str();
-}
-
 // =================================================================================================

 //
whisper.h (70 lines changed): several of the hunks below only realign whitespace; the substantive changes are the whisper_print_system_info() declaration moving next to the timing helpers and the const being dropped from the n_processors parameter of whisper_full_parallel().
@@ -72,16 +72,16 @@ extern "C" {
 whisper_token id; // token id
 whisper_token tid; // forced timestamp token id

 float p; // probability of the token
 float pt; // probability of the timestamp token
 float ptsum; // sum of probabilities of all timestamp tokens

 // token-level timestamp data
 // do not use if you haven't computed token-level timestamps
 int64_t t0; // start time of the token
 int64_t t1; // end time of the token

 float vlen; // voice length of the token
 } whisper_token_data;

 // Allocates all memory needed for the model and loads the model from the given file.
@@ -96,9 +96,9 @@ extern "C" {
 // Returns 0 on success
 WHISPER_API int whisper_pcm_to_mel(
 struct whisper_context * ctx,
 const float * samples,
 int n_samples,
 int n_threads);

 // This can be used to set a custom log mel spectrogram inside the provided whisper context.
 // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
@@ -106,9 +106,9 @@ extern "C" {
 // Returns 0 on success
 WHISPER_API int whisper_set_mel(
 struct whisper_context * ctx,
 const float * data,
 int n_len,
 int n_mel);

 // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
 // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
@@ -116,8 +116,8 @@ extern "C" {
 // Returns 0 on success
 WHISPER_API int whisper_encode(
 struct whisper_context * ctx,
 int offset,
 int n_threads);

 // Run the Whisper decoder to obtain the logits and probabilities for the next token.
 // Make sure to call whisper_encode() first.
@@ -126,10 +126,10 @@ extern "C" {
 // Returns 0 on success
 WHISPER_API int whisper_decode(
 struct whisper_context * ctx,
 const whisper_token * tokens,
 int n_tokens,
 int n_past,
 int n_threads);

 // Token sampling methods.
 // These are provided for convenience and can be used after each call to whisper_decode().
@@ -169,6 +169,9 @@ extern "C" {
 WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
 WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

+// Print system information
+WHISPER_API const char * whisper_print_system_info(void);
+
 ////////////////////////////////////////////////////////////////////////////

 // Available sampling strategies
@@ -187,12 +190,12 @@ extern "C" {

 int n_threads;
 int n_max_text_ctx;
 int offset_ms; // start offset in ms
 int duration_ms; // audio duration to process in ms

 bool translate;
 bool no_context;
 bool single_segment; // force single segment output (useful for streaming)
 bool print_special;
 bool print_progress;
 bool print_realtime;
@@ -206,8 +209,8 @@ extern "C" {
 int max_tokens; // max tokens per segment (0 = no limit)

 // [EXPERIMENTAL] speed-up techniques
 bool speed_up; // speed-up the audio by 2x using Phase Vocoder
 int audio_ctx; // overwrite the audio context size (0 = use default)

 // tokens to provide the whisper model as initial prompt
 // these are prepended to any existing text context from a previous call
@@ -235,20 +238,20 @@ extern "C" {
 // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
 // Uses the specified decoding strategy to obtain the text.
 WHISPER_API int whisper_full(
 struct whisper_context * ctx,
 struct whisper_full_params params,
 const float * samples,
 int n_samples);

 // Split the input audio in chunks and process each chunk separately using whisper_full()
 // It seems this approach can offer some speedup in some cases.
 // However, the transcription accuracy can be worse at the beginning and end of each chunk.
 WHISPER_API int whisper_full_parallel(
 struct whisper_context * ctx,
 struct whisper_full_params params,
 const float * samples,
 int n_samples,
-const int n_processors);
+int n_processors);

 // Number of generated text segments.
 // A segment can be a few words, a sentence, or even a paragraph.
@@ -275,9 +278,6 @@ extern "C" {
 // Get the probability of the specified token in the specified segment.
 WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);

-// Print system information
-WHISPER_API const char * whisper_print_system_info(void);
-
 #ifdef __cplusplus
 }
 #endif