//
// ViewController.m
// whisper.objc
//
// Created by Georgi Gerganov on 23.10.22.
//

#import "ViewController.h"
|
|
|
|
#import "whisper.h"
|
|
|
|
#define NUM_BYTES_PER_BUFFER 16*1024
|
|
|
|
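// each buffer holds 16*1024 bytes = 8192 16-bit samples, i.e. roughly
// 0.5 s of audio at WHISPER_SAMPLE_RATE (16 kHz)
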
// callback used to process captured audio
void AudioInputCallback(void * inUserData,
                        AudioQueueRef inAQ,
                        AudioQueueBufferRef inBuffer,
                        const AudioTimeStamp * inStartTime,
                        UInt32 inNumberPacketDescriptions,
                        const AudioStreamPacketDescription * inPacketDescs);

@interface ViewController ()

@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;

@end

@implementation ViewController

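// capture format: mono, 16-bit signed integer PCM at WHISPER_SAMPLE_RATE
// (16 kHz) - whisper.cpp operates on 16 kHz mono audio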
- (void)setupAudioFormat:(AudioStreamBasicDescription*)format
{
    format->mSampleRate       = WHISPER_SAMPLE_RATE;
    format->mFormatID         = kAudioFormatLinearPCM;
    format->mFramesPerPacket  = 1;
    format->mChannelsPerFrame = 1;
    format->mBytesPerFrame    = 2;
    format->mBytesPerPacket   = 2;
    format->mBitsPerChannel   = 16;
    format->mReserved         = 0;
    format->mFormatFlags      = kLinearPCMFormatFlagIsSignedInteger;
}

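// load the whisper model and allocate the audio buffers once at startup;
// the app bundle is expected to contain ggml-base.en.bin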
- (void)viewDidLoad {
    [super viewDidLoad];

    // whisper.cpp initialization
    {
        // load the model
        NSString *modelPath = [[NSBundle mainBundle] pathForResource:@"ggml-base.en" ofType:@"bin"];

        // check if the model exists
        if (![[NSFileManager defaultManager] fileExistsAtPath:modelPath]) {
            NSLog(@"Model file not found");
            return;
        }

        NSLog(@"Loading model from %@", modelPath);

        // create ggml context
        stateInp.ctx = whisper_init([modelPath UTF8String]);

        // check if the model was loaded successfully
        if (stateInp.ctx == NULL) {
            NSLog(@"Failed to load model");
            return;
        }
    }

    // initialize audio format and buffers
    {
        [self setupAudioFormat:&stateInp.dataFormat];

        stateInp.n_samples = 0;
        stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
        stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
    }

    stateInp.isTranscribing = false;
    stateInp.isRealtime = false;
}

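// tear down the audio queue: stop it, free its buffers, and dispose of it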
-(IBAction) stopCapturing {
    NSLog(@"Stop capturing");

    _labelStatusInp.text = @"Status: Idle";

    [_buttonToggleCapture setTitle:@"Start capturing" forState:UIControlStateNormal];
    [_buttonToggleCapture setBackgroundColor:[UIColor grayColor]];

    stateInp.isCapturing = false;

    AudioQueueStop(stateInp.queue, true);
    for (int i = 0; i < NUM_BUFFERS; i++) {
        AudioQueueFreeBuffer(stateInp.queue, stateInp.buffers[i]);
    }

    AudioQueueDispose(stateInp.queue, true);
}

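// start or stop audio capture; filled buffers are delivered to
// AudioInputCallback on the current run loop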
- (IBAction)toggleCapture:(id)sender {
    if (stateInp.isCapturing) {
        // stop capturing
        [self stopCapturing];

        return;
    }

    // initiate audio capturing
    NSLog(@"Start capturing");

    stateInp.n_samples = 0;
    stateInp.vc = (__bridge void *)(self);

    OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
                                         AudioInputCallback,
                                         &stateInp,
                                         CFRunLoopGetCurrent(),
                                         kCFRunLoopCommonModes,
                                         0,
                                         &stateInp.queue);

    if (status == 0) {
        for (int i = 0; i < NUM_BUFFERS; i++) {
            AudioQueueAllocateBuffer(stateInp.queue, NUM_BYTES_PER_BUFFER, &stateInp.buffers[i]);
            AudioQueueEnqueueBuffer (stateInp.queue, stateInp.buffers[i], 0, NULL);
        }

        stateInp.isCapturing = true;
        status = AudioQueueStart(stateInp.queue, NULL);
        if (status == 0) {
            _labelStatusInp.text = @"Status: Capturing";
            [sender setTitle:@"Stop Capturing" forState:UIControlStateNormal];
            [_buttonToggleCapture setBackgroundColor:[UIColor redColor]];
        }
    }

    if (status != 0) {
        [self stopCapturing];
    }
}

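// prepare for a one-shot transcription: show a progress message and stop
// realtime mode and any active capture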
- (IBAction)onTranscribePrepare:(id)sender {
    _textviewResult.text = @"Processing - please wait ...";

    if (stateInp.isRealtime) {
        [self onRealtime:(id)sender];
    }

    if (stateInp.isCapturing) {
        [self stopCapturing];
    }
}

- (IBAction)onRealtime:(id)sender {
    stateInp.isRealtime = !stateInp.isRealtime;

    if (stateInp.isRealtime) {
        [_buttonRealtime setBackgroundColor:[UIColor greenColor]];
    } else {
        [_buttonRealtime setBackgroundColor:[UIColor grayColor]];
    }

    NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
}

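// run whisper on the captured audio; the heavy lifting happens on a
// background queue and the result is dispatched back to the main thread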
- (IBAction)onTranscribe:(id)sender {
    if (stateInp.isTranscribing) {
        return;
    }

    NSLog(@"Processing %d samples", stateInp.n_samples);

    stateInp.isTranscribing = true;

    // dispatch the model to a background thread
    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
        // process captured audio
        // convert I16 to F32 samples in the range [-1, 1)
        for (int i = 0; i < self->stateInp.n_samples; i++) {
            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
        }

        // run the model
        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

        // get maximum number of threads on this device (max 8)
        const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);

        params.print_realtime   = true;
        params.print_progress   = false;
        params.print_timestamps = true;
        params.print_special    = false;
        params.translate        = false;
        params.language         = "en";
        params.n_threads        = max_threads;
        params.offset_ms        = 0;
        params.no_context       = true;
        params.single_segment   = self->stateInp.isRealtime;

        CFTimeInterval startTime = CACurrentMediaTime();

        whisper_reset_timings(self->stateInp.ctx);

        if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
            NSLog(@"Failed to run the model");

            // report the failure on the main thread (UIKit is not
            // thread-safe) and clear the flag so a retry is possible
            dispatch_async(dispatch_get_main_queue(), ^{
                self->_textviewResult.text = @"Failed to run the model";
                self->stateInp.isTranscribing = false;
            });

            return;
        }

        whisper_print_timings(self->stateInp.ctx);

        CFTimeInterval endTime = CACurrentMediaTime();

        NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);

        // result text
        NSString *result = @"";

        int n_segments = whisper_full_n_segments(self->stateInp.ctx);
        for (int i = 0; i < n_segments; i++) {
            const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);

            // append the text to the result
            result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
        }

        const float tRecording = (float)self->stateInp.n_samples / (float)self->stateInp.dataFormat.mSampleRate;

        // append recording and processing times
        result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[recording time: %5.3f s]", tRecording]];
        result = [result stringByAppendingString:[NSString stringWithFormat:@" \n[processing time: %5.3f s]", endTime - startTime]];

        // dispatch the result to the main thread
        dispatch_async(dispatch_get_main_queue(), ^{
            self->_textviewResult.text = result;
            self->stateInp.isTranscribing = false;
        });
    });
}

//
// Callback implementation
//

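// invoked by the audio queue each time a capture buffer fills up: append
// the new samples to the accumulation buffer, re-enqueue the buffer, and
// in realtime mode kick off a transcription on the main thread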
void AudioInputCallback(void * inUserData,
                        AudioQueueRef inAQ,
                        AudioQueueBufferRef inBuffer,
                        const AudioTimeStamp * inStartTime,
                        UInt32 inNumberPacketDescriptions,
                        const AudioStreamPacketDescription * inPacketDescs)
{
    StateInp * stateInp = (StateInp*)inUserData;

    if (!stateInp->isCapturing) {
        NSLog(@"Not capturing, ignoring audio");
        return;
    }

    // 2 bytes per 16-bit sample
    const int n = inBuffer->mAudioDataByteSize / 2;

    NSLog(@"Captured %d new samples", n);

    if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
        NSLog(@"Too much audio data, ignoring");

        dispatch_async(dispatch_get_main_queue(), ^{
            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
            [vc stopCapturing];
        });

        return;
    }

    for (int i = 0; i < n; i++) {
        stateInp->audioBufferI16[stateInp->n_samples + i] = ((short*)inBuffer->mAudioData)[i];
    }

    stateInp->n_samples += n;

    // put the buffer back in the queue
    AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);

    if (stateInp->isRealtime) {
        // dispatch onTranscribe() to the main thread
        dispatch_async(dispatch_get_main_queue(), ^{
            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
            [vc onTranscribe:nil];
        });
    }
}

@end