Compare commits

2 Commits

41 changed files with 1051 additions and 6256 deletions

.gitignore

@@ -34,5 +34,3 @@ examples/whisper.objc/whisper.objc.xcodeproj/xcuserdata/
 examples/whisper.objc/whisper.objc.xcodeproj/project.xcworkspace/xcuserdata
 extra/bench-gg.txt
-*.mlmodel*

@@ -191,12 +191,10 @@ else()
         if(NOT WHISPER_NO_FMA)
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
         endif()
-        if(NOT WHISPER_NO_F16C)
             set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
-        endif()
     endif()
     endif()
 endif()

 if (WHISPER_PERF)
     set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)

@@ -34,12 +34,6 @@ CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
 CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =

-# ref: https://github.com/ggerganov/whisper.cpp/issues/37
-ifneq ($(wildcard /usr/include/musl/*),)
-	CFLAGS   += -D_POSIX_SOURCE -D_GNU_SOURCE
-	CXXFLAGS += -D_POSIX_SOURCE -D_GNU_SOURCE
-endif

 # OS specific
 # TODO: support Windows
 ifeq ($(UNAME_S),Linux)

@@ -433,19 +433,6 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a
 ---
-## Video comparison of different models
-Use the [extra/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/extra/bench-wts.sh) script to generate a video in the following format:
-```java
-./extra/bench-wts.sh samples/jfk.wav
-ffplay ./samples/jfk.wav.all.mp4
-```
-https://user-images.githubusercontent.com/1991296/223206245-2d36d903-cf8e-4f09-8c3b-eb9f9c39d6fc.mp4
----
 ## Benchmarks
 In order to have an objective comparison of the performance of the inference across different system configurations,

@@ -466,7 +453,7 @@ The original models are converted to a custom binary format. This allows to pack
 You can download the converted models using the [models/download-ggml-model.sh](models/download-ggml-model.sh) script
 or manually from here:
-- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com
 For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README

@@ -476,7 +463,6 @@ in [models](models).
 - [X] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
-  - React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
 - [X] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)

@@ -486,7 +472,6 @@ in [models](models).
 - [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
   - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
   - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
-- [X] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
 ## Examples

@@ -17,7 +17,7 @@ import (
 // CONSTANTS
 const (
-	srcUrl  = "https://huggingface.co/ggerganov/whisper.cpp/resolve/main" // The location of the models
+	srcUrl  = "https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main" // The location of the models
 	srcExt  = ".bin"     // Filename extension
 	bufSize = 1024 * 64  // Size of the buffer used for downloading the model
 )

@@ -94,7 +94,6 @@ func (model *model) NewContext() (Context, error) {
 	params.SetPrintRealtime(false)
 	params.SetPrintTimestamps(false)
 	params.SetThreads(runtime.NumCPU())
-	params.SetNoContext(true)

 	// Return new context
 	return newContext(model, params)

@@ -20,7 +20,7 @@ extern bool callEncoderBegin(void* user_data);
 // Text segment callback
 // Called on every newly generated text segment
 // Use the whisper_full_...() functions to obtain the text segments
-static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_state* state, int n_new, void* user_data) {
+static void whisper_new_segment_cb(struct whisper_context* ctx, int n_new, void* user_data) {
     if(user_data != NULL && ctx != NULL) {
         callNewSegment(user_data, n_new);
     }

@@ -29,7 +29,7 @@ static void whisper_new_segment_cb(struct whisper_context* ctx, struct whisper_s
 // Encoder begin callback
 // If not NULL, called before the encoder starts
 // If it returns false, the computation is aborted
-static bool whisper_encoder_begin_cb(struct whisper_context* ctx, struct whisper_state* state, void* user_data) {
+static bool whisper_encoder_begin_cb(struct whisper_context* ctx, void* user_data) {
     if(user_data != NULL && ctx != NULL) {
         return callEncoderBegin(user_data);
     }
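Note: the left-hand side of this compare passes an extra `struct whisper_state *` to both callbacks. As a rough sketch of how these hooks are wired up through `whisper_full_params` (field names match their use elsewhere in this diff; the payload struct and values are illustrative assumptions, not code from the repository):

```cpp
#include "whisper.h"

#include <cstdio>

struct segment_counter { int segments_seen = 0; }; // illustrative user_data payload

int run_with_callbacks(struct whisper_context * ctx, const float * pcm, int n_samples) {
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

    segment_counter counter;

    // fired after each newly generated text segment
    wparams.new_segment_callback = [](struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
        ((segment_counter *) user_data)->segments_seen += n_new;

        const int n = whisper_full_n_segments(ctx);
        for (int i = n - n_new; i < n; ++i) {
            printf("%s\n", whisper_full_get_segment_text(ctx, i));
        }
    };
    wparams.new_segment_callback_user_data = &counter;

    // fired before the encoder runs; returning false aborts the computation
    wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * /*user_data*/) {
        return true;
    };

    return whisper_full(ctx, wparams, pcm, n_samples);
}
```

Captureless lambdas convert to the plain function pointers these fields expect, which is why every callback in this compare threads its context through `user_data` rather than capturing.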

@@ -199,7 +199,7 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
     {
         static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-        rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+        rwp->params.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
             bool is_aborted = *(bool*)user_data;
             return !is_aborted;
         };

@@ -63,5 +63,4 @@ else()
     add_subdirectory(command)
     add_subdirectory(bench)
     add_subdirectory(talk)
-    add_subdirectory(talk.llama)
 endif()

@@ -72,7 +72,7 @@ int timestamp_to_sample(int64_t t, int n_samples) {
     return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
 }

-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
     const auto & params  = *((whisper_print_user_data *) user_data)->params;
     const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@@ -260,7 +260,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
     {
         static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-        wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+        wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
             bool is_aborted = *(bool*)user_data;
             return !is_aborted;
         };

@@ -292,41 +292,13 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
     return 0;
 }

-class Worker : public Napi::AsyncWorker {
- public:
-  Worker(Napi::Function& callback, whisper_params params)
-      : Napi::AsyncWorker(callback), params(params) {}
-
-  void Execute() override {
-    run(params, result);
-  }
-
-  void OnOK() override {
-    Napi::HandleScope scope(Env());
-    Napi::Object res = Napi::Array::New(Env(), result.size());
-    for (uint64_t i = 0; i < result.size(); ++i) {
-      Napi::Object tmp = Napi::Array::New(Env(), 3);
-      for (uint64_t j = 0; j < 3; ++j) {
-        tmp[j] = Napi::String::New(Env(), result[i][j]);
-      }
-      res[i] = tmp;
-    }
-    Callback().Call({Env().Null(), res});
-  }
-
- private:
-  whisper_params params;
-  std::vector<std::vector<std::string>> result;
-};
-
-Napi::Value whisper(const Napi::CallbackInfo& info) {
+Napi::Object whisper(const Napi::CallbackInfo& info) {
     Napi::Env env = info.Env();
     if (info.Length() <= 0 || !info[0].IsObject()) {
         Napi::TypeError::New(env, "object expected").ThrowAsJavaScriptException();
     }
     whisper_params params;
+    std::vector<std::vector<std::string>> result;

     Napi::Object whisper_params = info[0].As<Napi::Object>();
     std::string language = whisper_params.Get("language").As<Napi::String>();

@@ -337,10 +309,25 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
     params.model = model;
     params.fname_inp.emplace_back(input);

-    Napi::Function callback = info[1].As<Napi::Function>();
-    Worker* worker = new Worker(callback, params);
-    worker->Queue();
-
-    return env.Undefined();
+    // run model
+    run(params, result);
+
+    fprintf(stderr, "RESULT:\n");
+    for (auto sentence:result) {
+        fprintf(stderr, "t0: %s, t1: %s, content: %s \n",
+                sentence[0].c_str(), sentence[1].c_str(), sentence[2].c_str());
+    }
+
+    Napi::Object res = Napi::Array::New(env, result.size());
+    for (uint64_t i = 0; i < result.size(); ++i) {
+        Napi::Object tmp = Napi::Array::New(env, 3);
+        for (uint64_t j = 0; j < 3; ++j) {
+            tmp[j] = Napi::String::New(env, result[i][j]);
+        }
+        res[i] = tmp;
+    }
+    return res;
 }

@@ -1,16 +1,10 @@
-const path = require("path");
-const { whisper } = require(path.join(
-  __dirname,
-  "../../build/Release/whisper-addon"
-));
-const { promisify } = require("util");
-
-const whisperAsync = promisify(whisper);
+const path = require('path');
+const { whisper } = require(path.join(__dirname, '../../build/Release/whisper-addon'));

 const whisperParams = {
-  language: "en",
-  model: path.join(__dirname, "../../models/ggml-base.en.bin"),
-  fname_inp: "../../samples/jfk.wav",
+  language: 'en',
+  model: path.join(__dirname, '../../models/ggml-base.en.bin'),
+  fname_inp: '',
 };

 const arguments = process.argv.slice(2);

@@ -20,7 +14,7 @@ const params = Object.fromEntries(
       return [...pre, item.slice(2).split("=")];
     }
     return pre;
-  }, [])
+  }, []),
 );

 for (const key in params) {

@@ -29,8 +23,5 @@ for (const key in params) {
   }
 }

-console.log("whisperParams =", whisperParams);
-
-whisperAsync(whisperParams).then((result) => {
-  console.log(`Result from whisper: ${result}`);
-});
+console.log('whisperParams =', whisperParams);
+console.log(whisper(whisperParams));

@@ -31,7 +31,6 @@ options:
   -osrt, --output-srt [false ] output result in a srt file
   -owts, --output-words [false ] output script for generating karaoke video
   -ocsv, --output-csv [false ] output result in a CSV file
-  -oj, --output-json [false ] output result in a JSON file
   -of FNAME, --output-file FNAME [ ] output file path (without file extension)
   -ps, --print-special [false ] print special tokens
   -pc, --print-colors [false ] print colors

@@ -73,7 +73,6 @@ struct whisper_params {
     bool output_srt     = false;
     bool output_wts     = false;
     bool output_csv     = false;
-    bool output_jsn     = false;
     bool print_special  = false;
     bool print_colors   = false;
     bool print_progress = false;

@@ -81,7 +80,6 @@
     std::string language = "en";
     std::string prompt;
-    std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
     std::string model    = "models/ggml-base.en.bin";

     std::vector<std::string> fname_inp = {};

@@ -129,9 +127,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
     else if (arg == "-ovtt" || arg == "--output-vtt")    { params.output_vtt    = true; }
     else if (arg == "-osrt" || arg == "--output-srt")    { params.output_srt    = true; }
     else if (arg == "-owts" || arg == "--output-words")  { params.output_wts    = true; }
-    else if (arg == "-fp"   || arg == "--font-path")     { params.font_path     = argv[++i]; }
     else if (arg == "-ocsv" || arg == "--output-csv")    { params.output_csv    = true; }
-    else if (arg == "-oj"   || arg == "--output-json")   { params.output_jsn    = true; }
     else if (arg == "-of"   || arg == "--output-file")   { params.fname_out.emplace_back(argv[++i]); }
     else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
     else if (arg == "-pc"   || arg == "--print-colors")  { params.print_colors  = true; }

@@ -178,9 +174,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
     fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
     fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
     fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
-    fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
     fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
-    fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
     fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
     fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
     fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");

@@ -199,7 +193,7 @@ struct whisper_print_user_data {
     const std::vector<std::vector<float>> * pcmf32s;
 };

-void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
+void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
     const auto & params  = *((whisper_print_user_data *) user_data)->params;
     const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;

@@ -371,144 +365,16 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
     return true;
 }

-bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
-    std::ofstream fout(fname);
-    int indent = 0;
-
-    auto doindent = [&]() {
-        for (int i = 0; i < indent; i++) fout << "\t";
-    };
-
-    auto start_arr = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": [\n";
-        indent++;
-    };
-
-    auto end_arr = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "]\n" : "},\n");
-    };
-
-    auto start_obj = [&](const char *name = nullptr) {
-        doindent();
-        if (name) {
-            fout << "\"" << name << "\": {\n";
-        } else {
-            fout << "{\n";
-        }
-        indent++;
-    };
-
-    auto end_obj = [&](bool end = false) {
-        indent--;
-        doindent();
-        fout << (end ? "}\n" : "},\n");
-    };
-
-    auto start_value = [&](const char *name) {
-        doindent();
-        fout << "\"" << name << "\": ";
-    };
-
-    auto value_s = [&](const char *name, const char *val, bool end = false) {
-        start_value(name);
-        fout << "\"" << val << (end ? "\"\n" : "\",\n");
-    };
-
-    auto end_value = [&](bool end = false) {
-        fout << (end ? "\n" : ",\n");
-    };
-
-    auto value_i = [&](const char *name, const int64_t val, bool end = false) {
-        start_value(name);
-        fout << val;
-        end_value(end);
-    };
-
-    auto value_b = [&](const char *name, const bool val, bool end = false) {
-        start_value(name);
-        fout << (val ? "true" : "false");
-        end_value(end);
-    };
-
-    if (!fout.is_open()) {
-        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
-        return false;
-    }
-
-    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
-    start_obj();
-        value_s("systeminfo", whisper_print_system_info());
-        start_obj("model");
-            value_s("type", whisper_model_type_readable(ctx));
-            value_b("multilingual", whisper_is_multilingual(ctx));
-            value_i("vocab", whisper_model_n_vocab(ctx));
-            start_obj("audio");
-                value_i("ctx", whisper_model_n_audio_ctx(ctx));
-                value_i("state", whisper_model_n_audio_state(ctx));
-                value_i("head", whisper_model_n_audio_head(ctx));
-                value_i("layer", whisper_model_n_audio_layer(ctx), true);
-            end_obj();
-            start_obj("text");
-                value_i("ctx", whisper_model_n_text_ctx(ctx));
-                value_i("state", whisper_model_n_text_state(ctx));
-                value_i("head", whisper_model_n_text_head(ctx));
-                value_i("layer", whisper_model_n_text_layer(ctx), true);
-            end_obj();
-            value_i("mels", whisper_model_n_mels(ctx));
-            value_i("f16", whisper_model_f16(ctx), true);
-        end_obj();
-        start_obj("params");
-            value_s("model", params.model.c_str());
-            value_s("language", params.language.c_str());
-            value_b("translate", params.translate, true);
-        end_obj();
-        start_obj("result");
-            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
-        end_obj();
-        start_arr("transcription");
-
-            const int n_segments = whisper_full_n_segments(ctx);
-            for (int i = 0; i < n_segments; ++i) {
-                const char * text = whisper_full_get_segment_text(ctx, i);
-                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
-                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
-
-                start_obj();
-                    start_obj("timestamps");
-                        value_s("from", to_timestamp(t0, true).c_str());
-                        value_s("to", to_timestamp(t1, true).c_str(), true);
-                    end_obj();
-                    start_obj("offsets");
-                        value_i("from", t0 * 10);
-                        value_i("to", t1 * 10, true);
-                    end_obj();
-                    value_s("text", text, true);
-                end_obj(i == (n_segments - 1));
-            }
-
-        end_arr(true);
-    end_obj(true);
-    return true;
-}
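For reference, the removed writer emits JSON of roughly the following shape; this is an illustrative sketch with placeholder values (the model figures correspond to a base.en-class model), not output captured from a real run:

```json
{
	"systeminfo": "AVX = 1 | AVX2 = 1 | ...",
	"model": {
		"type": "base",
		"multilingual": false,
		"vocab": 51864,
		"audio": { "ctx": 1500, "state": 512, "head": 8, "layer": 6 },
		"text": { "ctx": 448, "state": 512, "head": 8, "layer": 6 },
		"mels": 80,
		"f16": 1
	},
	"params": { "model": "models/ggml-base.en.bin", "language": "en", "translate": false },
	"result": { "language": "en" },
	"transcription": [
		{
			"timestamps": { "from": "00:00:00,000", "to": "00:00:11,000" },
			"offsets": { "from": 0, "to": 11000 },
			"text": " And so my fellow Americans..."
		}
	]
}
```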
 // karaoke video generation
 // outputs a bash script that uses ffmpeg to generate a video with the subtitles
 // TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
     std::ofstream fout(fname);

     fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);

-    static const char * font = params.font_path.c_str();
-
-    std::ifstream fin(font);
-    if (!fin.is_open()) {
-        fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
-        return false;
-    }
+    // TODO: become parameter
+    static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";

     fout << "#!/bin/bash" << "\n";
     fout << "\n";

@@ -742,7 +608,7 @@ int main(int argc, char ** argv) {
     {
         static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
-        wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
+        wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, void * user_data) {
             bool is_aborted = *(bool*)user_data;
             return !is_aborted;
         };

@@ -788,12 +654,6 @@ int main(int argc, char ** argv) {
                 const auto fname_csv = fname_out + ".csv";
                 output_csv(ctx, fname_csv.c_str());
             }

-            // output to JSON file
-            if (params.output_jsn) {
-                const auto fname_jsn = fname_out + ".json";
-                output_json(ctx, fname_jsn.c_str(), params);
-            }
         }
     }
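The `// NOTE: this should be atomic` comment recurs at every `encoder_begin_callback` site in this compare. A minimal sketch of the thread-safe variant the note asks for, assuming the state-carrying callback signature from the left side of this diff (illustrative, not repository code):

```cpp
#include <atomic>

// replacing the plain `static bool is_aborted` above:
static std::atomic<bool> is_aborted{false}; // may be set from another thread

wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
    // returning false makes whisper_full() abort before the encoder runs
    return !((const std::atomic<bool> *) user_data)->load();
};
wparams.encoder_begin_callback_user_data = &is_aborted;
```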

@@ -288,6 +288,7 @@ int main(int argc, char ** argv) {
     wparams.print_realtime   = false;
     wparams.print_timestamps = !params.no_timestamps;
     wparams.translate        = params.translate;
+    wparams.no_context       = true;
     wparams.single_segment   = !use_vad;
     wparams.max_tokens       = params.max_tokens;
     wparams.language         = params.language.c_str();
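For context, `no_context = true` tells the decoder not to reuse previously transcribed text as a prompt for the next chunk, which is the safer default for live streaming since one bad segment cannot poison the ones after it. A hedged sketch of the relevant settings (field names per `whisper_full_params`; surrounding setup omitted):

```cpp
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);

// decode each real-time chunk independently: do not feed earlier output
// back in as a prompt for the decoder
wparams.no_context     = true;
wparams.single_segment = true; // one segment per chunk keeps latency predictable
```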

@@ -1,2 +0,0 @@
eleven-labs.py
audio.mp3

@@ -1,12 +0,0 @@
if (WHISPER_SUPPORT_SDL2)
# talk.llama
set(TARGET talk-llama)
# TODO: this is temporary
# need to export ggml symbols for MSVC, but too lazy ..
add_executable(${TARGET} talk-llama.cpp llama.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endif ()

@@ -1,2 +0,0 @@
# talk.llama

File diff suppressed because it is too large

@@ -1,153 +0,0 @@
#ifndef LLAMA_H
#define LLAMA_H
#include <stddef.h>
#include <stdint.h>
#include <stdbool.h>
#ifdef LLAMA_SHARED
# ifdef _WIN32
# ifdef LLAMA_BUILD
# define LLAMA_API __declspec(dllexport)
# else
# define LLAMA_API __declspec(dllimport)
# endif
# else
# define LLAMA_API __attribute__ ((visibility ("default")))
# endif
#else
# define LLAMA_API
#endif
#define LLAMA_FILE_VERSION 1
#define LLAMA_FILE_MAGIC 0x67676d66 // 'ggmf' in hex
#define LLAMA_FILE_MAGIC_UNVERSIONED 0x67676d6c // pre-versioned files
#ifdef __cplusplus
extern "C" {
#endif
//
// C interface
//
// TODO: show sample usage
//
struct llama_context;
typedef int llama_token;
typedef struct llama_token_data {
llama_token id; // token id
float p; // probability of the token
float plog; // log probability of the token
} llama_token_data;
typedef void (*llama_progress_callback)(double progress, void *ctx);
struct llama_context_params {
int n_ctx; // text context
int n_parts; // -1 for default
int seed; // RNG seed, 0 for random
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
// called with a progress value between 0 and 1, pass NULL to disable
llama_progress_callback progress_callback;
// context pointer passed to the progress callback
void * progress_callback_user_data;
};
LLAMA_API struct llama_context_params llama_context_default_params();
// Various functions for loading a ggml llama model.
// Allocate (almost) all memory needed for the model.
// Return NULL on failure
LLAMA_API struct llama_context * llama_init_from_file(
const char * path_model,
struct llama_context_params params);
// Frees all allocated memory
LLAMA_API void llama_free(struct llama_context * ctx);
// TODO: not great API - very likely to change
// Returns 0 on success
LLAMA_API int llama_model_quantize(
const char * fname_inp,
const char * fname_out,
int itype,
int qk);
// Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process
// n_past is the number of tokens to use from previous eval calls
// Returns 0 on success
LLAMA_API int llama_eval(
struct llama_context * ctx,
const llama_token * tokens,
int n_tokens,
int n_past,
int n_threads);
// Convert the provided text into tokens.
// The tokens pointer must be large enough to hold the resulting tokens.
// Returns the number of tokens on success, no more than n_max_tokens
// Returns a negative number on failure - the number of tokens that would have been returned
// TODO: not sure if correct
LLAMA_API int llama_tokenize(
struct llama_context * ctx,
const char * text,
llama_token * tokens,
int n_max_tokens,
bool add_bos);
LLAMA_API int llama_n_vocab(struct llama_context * ctx);
LLAMA_API int llama_n_ctx (struct llama_context * ctx);
LLAMA_API int llama_n_embd (struct llama_context * ctx);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
// Rows: n_tokens
// Cols: n_vocab
LLAMA_API float * llama_get_logits(struct llama_context * ctx);
// Get the embeddings for the input
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
LLAMA_API const char * llama_token_to_str(struct llama_context * ctx, llama_token token);
// Special tokens
LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos();
// TODO: improve the last_n_tokens interface ?
LLAMA_API llama_token llama_sample_top_p_top_k(
struct llama_context * ctx,
const llama_token * last_n_tokens_data,
int last_n_tokens_size,
int top_k,
double top_p,
double temp,
double repeat_penalty);
// Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
#ifdef __cplusplus
}
#endif
#endif
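The header above still carries a `// TODO: show sample usage` note. As orientation, here is a minimal hedged sketch of a prompt-then-sample loop built only from functions declared in this header (model path, thread count, and sampling values are illustrative placeholders; error handling is trimmed):

```cpp
#include "llama.h"

#include <cstdio>
#include <string>
#include <vector>

int main() {
    llama_context_params lparams = llama_context_default_params();
    lparams.n_ctx = 512;

    // placeholder model path (matches the default used by talk-llama below)
    struct llama_context * ctx = llama_init_from_file("models/ggml-llama-7B.bin", lparams);
    if (ctx == nullptr) return 1;

    // tokenize the prompt; the buffer must be large enough for the result
    const std::string prompt = " Hello";
    std::vector<llama_token> tokens(prompt.size() + 1);
    const int n = llama_tokenize(ctx, prompt.c_str(), tokens.data(), (int) tokens.size(), /*add_bos=*/true);
    if (n < 0) return 1;
    tokens.resize(n);

    // evaluate the prompt, then sample a few continuation tokens one at a time
    if (llama_eval(ctx, tokens.data(), (int) tokens.size(), 0, /*n_threads=*/4) != 0) return 1;
    int n_past = (int) tokens.size();

    for (int i = 0; i < 16; ++i) {
        const llama_token id = llama_sample_top_p_top_k(
            ctx, tokens.data(), (int) tokens.size(),
            /*top_k=*/40, /*top_p=*/0.95, /*temp=*/0.80, /*repeat_penalty=*/1.10);
        if (id == llama_token_eos()) break;

        printf("%s", llama_token_to_str(ctx, id));
        tokens.push_back(id);

        if (llama_eval(ctx, &id, 1, n_past, /*n_threads=*/4) != 0) return 1;
        n_past += 1;
    }

    llama_print_timings(ctx);
    llama_free(ctx);
    return 0;
}
```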

@@ -1,20 +0,0 @@
#!/bin/bash
# Usage:
# speak.sh <voice_id> <text-to-speak>
# espeak
# Mac OS: brew install espeak
# Linux: apt-get install espeak
#
#espeak -v en-us+m$1 -s 225 -p 50 -a 200 -g 5 -k 5 "$2"
# for Mac
say "$2"
# Eleven Labs
#
#wd=$(dirname $0)
#script=$wd/eleven-labs.py
#python3 $script $1 "$2" >/dev/null 2>&1
#ffplay -autoexit -nodisp -loglevel quiet -hide_banner -i ./audio.mp3 >/dev/null 2>&1

@@ -1,511 +0,0 @@
// Talk with AI
//
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "llama.h"
#include <cassert>
#include <cstdio>
#include <fstream>
#include <regex>
#include <string>
#include <thread>
#include <vector>
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
// initialize to prompt number of chars, since n_tokens <= n_prompt_chars
std::vector<llama_token> res(text.size() + (int)add_bos);
int n = llama_tokenize(ctx, text.c_str(), res.data(), res.size(), add_bos);
assert(n >= 0);
res.resize(n);
return res;
}
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t voice_ms = 10000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
bool no_timestamps = true;
std::string person = "Santa";
std::string language = "en";
std::string model_wsp = "models/ggml-base.en.bin";
std::string model_llama = "models/ggml-llama-7B.bin";
std::string speak = "./examples/talk/speak.sh";
std::string fname_out;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-vms" || arg == "--voice-ms") { params.voice_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-p" || arg == "--person") { params.person = argv[++i]; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-mw" || arg == "--model-whisper") { params.model_wsp = argv[++i]; }
else if (arg == "-ml" || arg == "--model-llama") { params.model_llama = argv[++i]; }
else if (arg == "-s" || arg == "--speak") { params.speak = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -vms N, --voice-ms N [%-7d] voice duration in milliseconds\n", params.voice_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -p NAME, --person NAME [%-7s] person name (for prompt selection)\n", params.person.c_str());
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -mw FILE, --model-whisper [%-7s] whisper model file\n", params.model_wsp.c_str());
fprintf(stderr, " -mg FILE, --model-llama [%-7s] llama model file\n", params.model_llama.c_str());
fprintf(stderr, " -s FILE, --speak TEXT [%-7s] command for TTS\n", params.speak.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, "\n");
}
std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;
t_ms = 0;
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
int prob_n = 0;
std::string result;
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
result += text;
const int n_tokens = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < n_tokens; ++j) {
const auto token = whisper_full_get_token_data(ctx, i, j);
prob += token.p;
++prob_n;
}
}
if (prob_n > 0) {
prob /= prob_n;
}
const auto t_end = std::chrono::high_resolution_clock::now();
t_ms = std::chrono::duration_cast<std::chrono::milliseconds>(t_end - t_start).count();
return result;
}
// need to have leading ' '
//const std::string k_prompt = R"( Transcript of a dialog, where {1} interacts with an Assistant named Bob. Bob is helpful, kind, honest, good at writing, and never fails to answer {1}'s requests immediately and with precision.
//
//{0}: Hello, Bob.
//{1}: Hello {0}. How may I help you today?
//{0}:)";
const std::string k_prompt = R"( Text transcript of a never ending dialog, where {0} interacts with an AI assistant named {1}.
{1} is helpful, kind, honest, friendly, good at writing and never fails to answer {0}'s requests immediately and with details and precision.
There are no annotations like (30 seconds passed...) or (to himself), just what {0} and {1} say aloud to each other.
The transcript only includes text, it does not include markup like HTML and Markdown.
{1} responds with short and concise answers.
{0}{4} Hello, {1}!
{1}{4} Hello {0}! How may I help you today?
{0}{4} What time is it?
{1}{4} It is {2} o'clock.
{0}{4} What year is it?
{1}{4} We are in {3}.
{0}{4} What is a cat?
{1}{4} A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
{0}{4} Name a color.
{1}{4} Blue
{0}{4})";
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
// whisper init
struct whisper_context * ctx_wsp = whisper_init_from_file(params.model_wsp.c_str());
// llama init
auto lparams = llama_context_default_params();
lparams.n_ctx = 512;
lparams.n_parts = 2; // TODO fix
lparams.seed = 1; // TODO fix
lparams.f16_kv = true;
struct llama_context * ctx_llama = llama_init_from_file(params.model_llama.c_str(), lparams);
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx_wsp)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing, %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
fprintf(stderr, "\n");
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
int n_iter = 0;
bool is_running = true;
bool force_speak = false;
float prob0 = 0.0f;
const std::string chat_symb = ":";
const std::string bot_name = "LLAMA";
std::vector<float> pcmf32_cur;
std::vector<float> pcmf32_prompt;
std::string prompt_org = k_prompt;
prompt_org = ::replace(prompt_org, "{0}", params.person);
prompt_org = ::replace(prompt_org, "{1}", bot_name);
{
// get time string
std::string time_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%H:%M", now);
time_str = buf;
}
prompt_org = ::replace(prompt_org, "{2}", time_str);
}
{
// get year string
std::string year_str;
{
time_t t = time(0);
struct tm * now = localtime(&t);
char buf[128];
strftime(buf, sizeof(buf), "%Y", now);
year_str = buf;
}
prompt_org = ::replace(prompt_org, "{3}", year_str);
}
prompt_org = ::replace(prompt_org, "{4}", chat_symb);
auto embd_inp = ::llama_tokenize(ctx_llama, prompt_org, true);
const int n_ctx = llama_n_ctx(ctx_llama);
printf("\n");
printf("%s : initializing - please wait ...\n", __func__);
if (llama_eval(ctx_llama, embd_inp.data(), embd_inp.size(), 0, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
//fprintf(stdout, "\n");
//fprintf(stdout, "%s", prompt_org.c_str());
//fflush(stdout);
printf("%s : done! start speaking in the microphone\n", __func__);
printf("\n");
printf("%s%s", params.person.c_str(), chat_symb.c_str());
fflush(stdout);
audio.clear();
const int n_keep = embd_inp.size();
const int voice_id = 2;
int n_past = n_keep;
int n_prev = 64; // TODO arg
std::vector<llama_token> embd;
std::vector<std::string> antiprompts = {
params.person + chat_symb,
};
// main loop
while (is_running) {
// handle Ctrl + C
is_running = sdl_poll_events();
if (!is_running) {
break;
}
// delay
std::this_thread::sleep_for(std::chrono::milliseconds(100));
int64_t t_ms = 0;
{
audio.get(2000, pcmf32_cur);
if (::vad_simple(pcmf32_cur, WHISPER_SAMPLE_RATE, 1250, params.vad_thold, params.freq_thold, params.print_energy) || force_speak) {
//fprintf(stdout, "%s: Speech detected! Processing ...\n", __func__);
audio.get(params.voice_ms, pcmf32_cur);
std::string text_heard;
if (!force_speak) {
text_heard = ::trim(::transcribe(ctx_wsp, params, pcmf32_cur, prob0, t_ms));
}
// remove text between brackets using regex
{
std::regex re("\\[.*?\\]");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove text between brackets using regex
{
std::regex re("\\(.*?\\)");
text_heard = std::regex_replace(text_heard, re, "");
}
// remove all characters, except for letters, numbers, punctuation and ':', '\'', '-', ' '
text_heard = std::regex_replace(text_heard, std::regex("[^a-zA-Z0-9\\.,\\?!\\s\\:\\'\\-]"), "");
// take first line
text_heard = text_heard.substr(0, text_heard.find_first_of('\n'));
// remove leading and trailing whitespace
text_heard = std::regex_replace(text_heard, std::regex("^\\s+"), "");
text_heard = std::regex_replace(text_heard, std::regex("\\s+$"), "");
const std::vector<llama_token> tokens = llama_tokenize(ctx_llama, text_heard.c_str(), false);
if (text_heard.empty() || tokens.empty() || force_speak) {
//fprintf(stdout, "%s: Heard nothing, skipping ...\n", __func__);
audio.clear();
continue;
}
force_speak = false;
text_heard.insert(0, 1, ' ');
text_heard += "\n" + bot_name + chat_symb;
fprintf(stdout, "%s%s%s", "\033[1m", text_heard.c_str(), "\033[0m");
fflush(stdout);
embd = ::llama_tokenize(ctx_llama, text_heard, false);
// text inference
bool done = false;
std::string text_to_speak;
while (true) {
// predict
if (embd.size() > 0) {
if (n_past + (int) embd.size() > n_ctx) {
n_past = n_keep;
// insert n_left/2 tokens at the start of embd from last_n_tokens
embd.insert(embd.begin(), embd_inp.begin() + embd_inp.size() - n_prev, embd_inp.end());
//printf("\n---\n");
//printf("resetting: '");
//for (int i = 0; i < (int) embd.size(); i++) {
// printf("%s", llama_token_to_str(ctx_llama, embd[i]));
//}
//printf("'\n");
//printf("\n---\n");
}
if (llama_eval(ctx_llama, embd.data(), embd.size(), n_past, params.n_threads)) {
fprintf(stderr, "%s : failed to eval\n", __func__);
return 1;
}
}
//printf("n_iter = %d, n_past = %d, n_ctx = %d, n_keep = %d, n_prev = %d, embd.size() = %d\n", n_iter, n_past, n_ctx, n_keep, n_prev, (int) embd.size());
embd_inp.insert(embd_inp.end(), embd.begin(), embd.end());
n_past += embd.size();
embd.clear();
if (done) break;
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const float repeat_penalty = 1.1764f;
const int repeat_last_n = 256;
llama_token id = 0;
{
//auto logits = llama_get_logits(ctx_llama);
//logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty);
}
if (id != llama_token_eos()) {
// add it to the context
embd.push_back(id);
text_to_speak += llama_token_to_str(ctx_llama, id);
printf("%s", llama_token_to_str(ctx_llama, id));
} else {
// TODO
printf("EOS TOKEN - SHOULD NOT HAPPEN\n");
exit(0);
}
}
{
std::string last_output;
for (int i = embd_inp.size() - 16; i < (int) embd_inp.size(); i++) {
last_output += llama_token_to_str(ctx_llama, embd_inp[i]);
}
last_output += llama_token_to_str(ctx_llama, embd[0]);
for (std::string & antiprompt : antiprompts) {
if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) {
done = true;
text_to_speak = ::replace(text_to_speak, antiprompt, "");
fflush(stdout);
break;
}
}
}
is_running = sdl_poll_events();
if (!is_running) {
break;
}
}
text_to_speak = ::replace(text_to_speak, "\"", "");
system((params.speak + " " + std::to_string(voice_id) + " \"" + text_to_speak + "\"").c_str());
audio.clear();
++n_iter;
}
}
}
audio.pause();
whisper_print_timings(ctx_wsp);
whisper_free(ctx_wsp);
return 0;
}

@@ -31,7 +31,7 @@ To run this, you will need a ggml GPT-2 model: [instructions](https://github.com
 Alternatively, you can simply download the smallest ggml GPT-2 117M model (240 MB) like this:

 ```
-wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
+wget --quiet --show-progress -O models/ggml-gpt-2-117M.bin https://huggingface.co/datasets/ggerganov/ggml/raw/main/ggml-model-gpt-2-117M.bin
 ```

 ## TTS

@@ -9,4 +9,4 @@ To use:
 5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
 [^1]: I recommend the tiny or base models for running on an Android device.

-<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">
+<img width="300" alt="image" src="https://user-images.githubusercontent.com/1991296/208154256-82d972dc-221b-48c4-bfcb-36ce68602f93.png">

@@ -2,7 +2,6 @@ package com.whispercppdemo.ui.main
 import androidx.compose.foundation.layout.*
 import androidx.compose.foundation.rememberScrollState
-import androidx.compose.foundation.text.selection.SelectionContainer
 import androidx.compose.foundation.verticalScroll
 import androidx.compose.material3.*
 import androidx.compose.runtime.Composable

@@ -20,7 +19,6 @@ fun MainScreen(viewModel: MainScreenViewModel) {
         canTranscribe = viewModel.canTranscribe,
         isRecording = viewModel.isRecording,
         messageLog = viewModel.dataLog,
-        onBenchmarkTapped = viewModel::benchmark,
         onTranscribeSampleTapped = viewModel::transcribeSample,
         onRecordTapped = viewModel::toggleRecord
     )

@@ -32,7 +30,6 @@ private fun MainScreen(
     canTranscribe: Boolean,
     isRecording: Boolean,
     messageLog: String,
-    onBenchmarkTapped: () -> Unit,
     onTranscribeSampleTapped: () -> Unit,
     onRecordTapped: () -> Unit
 ) {

@@ -48,11 +45,8 @@
             .padding(innerPadding)
             .padding(16.dp)
     ) {
-        Column(verticalArrangement = Arrangement.SpaceBetween) {
-            Row(horizontalArrangement = Arrangement.SpaceBetween, modifier = Modifier.fillMaxWidth()) {
-                BenchmarkButton(enabled = canTranscribe, onClick = onBenchmarkTapped)
-                TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
-            }
+        Row(horizontalArrangement = Arrangement.SpaceBetween) {
+            TranscribeSampleButton(enabled = canTranscribe, onClick = onTranscribeSampleTapped)
         RecordButton(
             enabled = canTranscribe,
             isRecording = isRecording,

@@ -66,17 +60,8 @@
 @Composable
 private fun MessageLog(log: String) {
-    SelectionContainer() {
         Text(modifier = Modifier.verticalScroll(rememberScrollState()), text = log)
-    }
 }

-@Composable
-private fun BenchmarkButton(enabled: Boolean, onClick: () -> Unit) {
-    Button(onClick = onClick, enabled = enabled) {
-        Text("Benchmark")
-    }
-}

 @Composable
 private fun TranscribeSampleButton(enabled: Boolean, onClick: () -> Unit) {

@@ -41,15 +41,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
     init {
         viewModelScope.launch {
-            printSystemInfo()
             loadData()
         }
     }

-    private suspend fun printSystemInfo() {
-        printMessage(String.format("System Info: %s\n", WhisperContext.getSystemInfo()));
-    }

     private suspend fun loadData() {
         printMessage("Loading data...\n")
         try {

@@ -86,29 +81,10 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
         //whisperContext = WhisperContext.createContextFromFile(firstModel.absolutePath)
     }

-    fun benchmark() = viewModelScope.launch {
-        runBenchmark(6)
-    }

     fun transcribeSample() = viewModelScope.launch {
         transcribeAudio(getFirstSample())
     }

-    private suspend fun runBenchmark(nthreads: Int) {
-        if (!canTranscribe) {
-            return
-        }
-
-        canTranscribe = false
-        printMessage("Running benchmark. This will take minutes...\n")
-        whisperContext?.benchMemory(nthreads)?.let{ printMessage(it) }
-        printMessage("\n")
-        whisperContext?.benchGgmlMulMat(nthreads)?.let{ printMessage(it) }
-        canTranscribe = true
-    }

     private suspend fun getFirstSample(): File = withContext(Dispatchers.IO) {
         samplesPath.listFiles()!!.first()
     }

@@ -138,14 +114,11 @@ class MainScreenViewModel(private val application: Application) : ViewModel() {
         canTranscribe = false

         try {
-            printMessage("Reading wave samples... ")
+            printMessage("Reading wave samples...\n")
             val data = readAudioSamples(file)
-            printMessage("${data.size / (16000 / 1000)} ms\n")
             printMessage("Transcribing data...\n")
-            val start = System.currentTimeMillis()
             val text = whisperContext?.transcribeData(data)
-            val elapsed = System.currentTimeMillis() - start
-            printMessage("Done ($elapsed ms): $text\n")
+            printMessage("Done: $text\n")
         } catch (e: Exception) {
             Log.w(LOG_TAG, e)
             printMessage("${e.localizedMessage}\n")

@@ -27,14 +27,6 @@ class WhisperContext private constructor(private var ptr: Long) {
         }
     }

-    suspend fun benchMemory(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchMemcpy(nthreads)
-    }
-
-    suspend fun benchGgmlMulMat(nthreads: Int): String = withContext(scope.coroutineContext) {
-        return@withContext WhisperLib.benchGgmlMulMat(nthreads)
-    }

     suspend fun release() = withContext(scope.coroutineContext) {
         if (ptr != 0L) {
             WhisperLib.freeContext(ptr)

@@ -74,10 +66,6 @@ class WhisperContext private constructor(private var ptr: Long) {
         }
         return WhisperContext(ptr)
     }

-        fun getSystemInfo(): String {
-            return WhisperLib.getSystemInfo()
-        }
     }
 }

@@ -86,7 +74,6 @@
     init {
         Log.d(LOG_TAG, "Primary ABI: ${Build.SUPPORTED_ABIS[0]}")
         var loadVfpv4 = false
-        var loadV8fp16 = false
        if (isArmEabiV7a()) {
            // armeabi-v7a needs runtime detection support
            val cpuInfo = cpuInfo()

@@ -97,24 +84,11 @@
                 loadVfpv4 = true
             }
         }
-        } else if (isArmEabiV8a()) {
-            // ARMv8.2a needs runtime detection support
-            val cpuInfo = cpuInfo()
-            cpuInfo?.let {
-                Log.d(LOG_TAG, "CPU info: $cpuInfo")
-                if (cpuInfo.contains("fphp")) {
-                    Log.d(LOG_TAG, "CPU supports fp16 arithmetic")
-                    loadV8fp16 = true
-                }
-            }
         }

         if (loadVfpv4) {
             Log.d(LOG_TAG, "Loading libwhisper_vfpv4.so")
             System.loadLibrary("whisper_vfpv4")
-        } else if (loadV8fp16) {
-            Log.d(LOG_TAG, "Loading libwhisper_v8fp16_va.so")
-            System.loadLibrary("whisper_v8fp16_va")
         } else {
             Log.d(LOG_TAG, "Loading libwhisper.so")
             System.loadLibrary("whisper")

@@ -129,9 +103,6 @@
         external fun fullTranscribe(contextPtr: Long, audioData: FloatArray)
         external fun getTextSegmentCount(contextPtr: Long): Int
         external fun getTextSegment(contextPtr: Long, index: Int): String
-        external fun getSystemInfo(): String
-        external fun benchMemcpy(nthread: Int): String
-        external fun benchGgmlMulMat(nthread: Int): String
     }
 }

@@ -139,10 +110,6 @@
     return Build.SUPPORTED_ABIS[0].equals("armeabi-v7a")
 }

-private fun isArmEabiV8a(): Boolean {
-    return Build.SUPPORTED_ABIS[0].equals("arm64-v8a")
-}

 private fun cpuInfo(): String? {
     return try {
         File("/proc/cpuinfo").inputStream().bufferedReader().use {

@@ -13,14 +13,3 @@ ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
 LOCAL_CFLAGS += -mfpu=neon-vfpv4
 include $(BUILD_SHARED_LIBRARY)
 endif

-ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
-include $(CLEAR_VARS)
-LOCAL_MODULE := libwhisper_v8fp16_va
-include $(LOCAL_PATH)/Whisper.mk
-# Allow building NEON FMA code.
-# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
-LOCAL_CFLAGS += -march=armv8.2-a+fp16
-include $(BUILD_SHARED_LIBRARY)
-endif

@@ -6,7 +6,6 @@
 #include <sys/sysinfo.h>
 #include <string.h>
 #include "whisper.h"
-#include "ggml.h"

 #define UNUSED(x) (void)(x)
 #define TAG "JNI"

@@ -215,29 +214,3 @@ Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getTextSegment(
     jstring string = (*env)->NewStringUTF(env, text);
     return string;
 }

-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_getSystemInfo(
-        JNIEnv *env, jobject thiz
-) {
-    UNUSED(thiz);
-    const char *sysinfo = whisper_print_system_info();
-    jstring string = (*env)->NewStringUTF(env, sysinfo);
-    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchMemcpy(JNIEnv *env, jobject thiz,
-                                                                      jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_memcpy = whisper_bench_memcpy_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_memcpy);
-    return string;
-}
-
-JNIEXPORT jstring JNICALL
-Java_com_whispercppdemo_whisper_WhisperLib_00024Companion_benchGgmlMulMat(JNIEnv *env, jobject thiz,
-                                                                          jint n_threads) {
-    UNUSED(thiz);
-    const char *bench_ggml_mul_mat = whisper_bench_ggml_mul_mat_str(n_threads);
-    jstring string = (*env)->NewStringUTF(env, bench_ggml_mul_mat);
-    return string;
-}

@@ -24,5 +24,3 @@ Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag in Build Pha
 This can significantly improve the performance of the transcription:

 <img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">

-In this project, it also added `-O3 -DNDEBUG` to `Other C Flags`, but adding flags to app proj is not ideal in real world (applies to all C/C++ files), consider splitting xcodeproj in workspace in your own project.

---

@@ -296,10 +296,6 @@
 				IPHONEOS_DEPLOYMENT_TARGET = 16.0;
 				MTL_ENABLE_DEBUG_INFO = NO;
 				MTL_FAST_MATH = YES;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				SDKROOT = iphoneos;
 				VALIDATE_PRODUCT = YES;
 			};

---

@@ -7,9 +7,8 @@ To use:

 2. Add the model to "whisper.swiftui.demo/Resources/models" via Xcode.
 3. Select a sample audio file (for example, [jfk.wav](https://github.com/ggerganov/whisper.cpp/raw/master/samples/jfk.wav)).
 4. Add the model to "whisper.swiftui.demo/Resources/samples" via Xcode.
-5. Select the "Release" [^2] build configuration under "Run", then deploy and run to your device.
+5. Select the "release" build configuration under "Run", then deploy and run to your device.

 [^1]: I recommend the tiny, base or small models for running on an iOS device.
-[^2]: The `Release` build can significantly boost transcription performance. In this project, `-O3 -DNDEBUG` was also added to `Other C Flags`, but adding flags at the app-project level is not ideal in a real-world project (they apply to all C/C++ files); consider splitting the xcodeproj into a workspace in your own project.

 ![image](https://user-images.githubusercontent.com/1991296/212539216-0aef65e4-f882-480a-8358-0f816838fd52.png)

---

@@ -430,10 +430,6 @@
 				LLVM_LTO = YES;
 				MACOSX_DEPLOYMENT_TARGET = 13.0;
 				MARKETING_VERSION = 1.0;
-				OTHER_CFLAGS = (
-					"-O3",
-					"-DNDEBUG",
-				);
 				PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
 				PRODUCT_NAME = "$(TARGET_NAME)";
 				SDKROOT = auto;

---

@@ -1,70 +0,0 @@
-# Benchmark word-level timestamps for different models
-#
-# This script takes two arguments
-# - an audio file
-# - [optional] path to a font file
-#
-# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu
-
-if [ -z "$1" ]; then
-    echo "Usage: $0 <audio file> [font file]"
-    exit 1
-fi
-
-#TODO: Make this a command line parameter
-#models="base small large"
-#models="tiny.en tiny base.en base small.en small medium.en medium large-v1 large"
-models="tiny.en base.en small.en medium.en large"
-
-DURATION=$(ffprobe -i "$1" -show_entries format=duration -v quiet -of csv="p=0")
-DURATION=$(printf "%.2f" "$DURATION")
-echo "Input file duration: ${DURATION}s"
-
-for model in $models; do
-    echo "Running $model"
-    COMMAND="./main -m models/ggml-$model.bin -owts -f $1 -of $1.$model"
-
-    if [ ! -z "$2" ]; then
-        COMMAND="$COMMAND -fp $2"
-    fi
-
-    #TODO: Surface errors better
-    # TIMEFMT is for zsh, TIMEFORMAT is for bash
-    EXECTIME=$({ TIMEFMT="%E"; TIMEFORMAT=%E; time $COMMAND >/dev/null 2>&1; } 2>&1)
-
-    # Slightly different formats between zsh and bash
-    if [ "${EXECTIME: -1}" == "s" ]; then
-        EXECTIME=${EXECTIME::-1}
-    fi
-
-    RATIO=$(echo "$DURATION / $EXECTIME" | bc -l)
-    RATIO=$(printf "%.2f" "$RATIO")
-
-    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"
-
-    # If the file already exists, delete it
-    if [ -f "$1.mp4" ]; then
-        rm "$1.mp4"
-    fi
-
-    bash "$1.$model.wts" >/dev/null 2>&1
-    mv "$1.mp4" "$1.$model.mp4"
-
-    ffmpeg -y -f lavfi -i color=c=black:s=1200x50:d=$DURATION -vf "drawtext=fontfile=$2:fontsize=36:x=10:y=(h-text_h)/2:text='ggml-$model - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey" "$1.$model.info.mp4" >/dev/null 2>&1
-done
-
-COMMAND="ffmpeg -y"
-for model in $models; do
-    COMMAND="$COMMAND -i $1.$model.info.mp4 -i $1.$model.mp4"
-done
-COMMAND="$COMMAND -filter_complex \""
-COUNT=0
-for model in $models; do
-    COMMAND="$COMMAND[${COUNT}:v][$(($COUNT+1)):v]"
-    COUNT=$((COUNT+2))
-done
-COMMAND="$COMMAND vstack=inputs=${COUNT}[v]\" -map \"[v]\" -map 1:a $1.all.mp4 >/dev/null 2>&1"
-
-echo $COMMAND
-
-# Run the command
-eval $COMMAND

---

ggml.c: 2628 lines changed (diff suppressed because it is too large)

---

ggml.h: 27 lines changed

@@ -198,8 +198,6 @@ struct ggml_object;
 struct ggml_context;

 enum ggml_type {
-    GGML_TYPE_Q4_0,
-    GGML_TYPE_Q4_1,
     GGML_TYPE_I8,
     GGML_TYPE_I16,
     GGML_TYPE_I32,

@@ -228,9 +226,7 @@ enum ggml_op {
     GGML_OP_STEP,
     GGML_OP_RELU,
     GGML_OP_GELU,
-    GGML_OP_SILU,
     GGML_OP_NORM, // normalize
-    GGML_OP_RMS_NORM,

     GGML_OP_MUL_MAT,

@@ -330,10 +326,7 @@ void ggml_print_objects(const struct ggml_context * ctx);
 int    ggml_nelements(const struct ggml_tensor * tensor);
 size_t ggml_nbytes  (const struct ggml_tensor * tensor);

-int    ggml_blck_size (enum ggml_type type);
-size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-float  ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+size_t ggml_type_size (enum ggml_type type);

 size_t ggml_element_size(const struct ggml_tensor * tensor);

 struct ggml_context * ggml_init(struct ggml_init_params params);

@@ -343,9 +336,6 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
 size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

-bool ggml_mlock_supported(void);
-bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
-
 struct ggml_tensor * ggml_new_tensor(
         struct ggml_context * ctx,
         enum   ggml_type      type,

@@ -476,20 +466,12 @@ struct ggml_tensor * ggml_gelu(
         struct ggml_context * ctx,
         struct ggml_tensor  * a);

-struct ggml_tensor * ggml_silu(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
 // normalize along rows
 // TODO: eps is hardcoded to 1e-5 for now
 struct ggml_tensor * ggml_norm(
         struct ggml_context * ctx,
         struct ggml_tensor  * a);

-struct ggml_tensor * ggml_rms_norm(
-        struct ggml_context * ctx,
-        struct ggml_tensor  * a);
-
 // A: m rows, n columns
 // B: p rows, n columns (i.e. we transpose it internally)
 // result is m columns, p rows
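The three comment lines above fully determine the shapes involved, but they are easier to see in action. Below is a minimal, self-contained sketch (not part of the diff) of a matrix product with this API; it assumes the revision of `ggml.h` shown here, that `ggml_new_tensor_2d`, `ggml_build_forward`, and `ggml_graph_compute` have their era-appropriate signatures, and that a 16 MB arena is enough for this toy graph:

```c
#include <stdio.h>
#include "ggml.h"

int main(void) {
    // small arena for tensors + graph (size is an arbitrary assumption)
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ NULL,
    };
    struct ggml_context * ctx = ggml_init(params);

    // ne[0] is the column count: A is 4 rows x 2 columns, B is 3 rows x 2 columns
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 4);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);

    // fill both with ones so the expected result is easy to check
    for (int i = 0; i < ggml_nelements(a); ++i) ((float *) a->data)[i] = 1.0f;
    for (int i = 0; i < ggml_nelements(b); ++i) ((float *) b->data)[i] = 1.0f;

    // per the header comment: result has m (= 4) columns and p (= 3) rows
    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);

    struct ggml_cgraph gf = ggml_build_forward(c);
    ggml_graph_compute(ctx, &gf);

    printf("c[0,0] = %f\n", ((float *) c->data)[0]); // 2.0: dot product of two length-2 rows of ones

    ggml_free(ctx);
    return 0;
}
```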
@@ -744,13 +726,6 @@ enum ggml_opt_result ggml_opt(
         struct ggml_opt_params params,
         struct ggml_tensor * f);

-//
-// quantization
-//
-
-size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
-size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist);
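These two prototypes appear without usage notes. A heavily hedged sketch of how they appear to be called follows; the interpretation of `n`, `k`, `qk`, and the histogram is my reading of the signatures, not something this header documents:

```c
// Hypothetical usage sketch for ggml_quantize_q4_0().
// Assumptions (not documented in the header shown above): n is the total
// number of input floats, k is the row length, qk is the quantization block
// size, and hist receives counts for the 16 possible 4-bit values.
float   src[4096];               // input weights, filled elsewhere
uint8_t dst[4096];               // Q4_0 output is smaller than the input; this over-allocates
int64_t hist[16] = {0};

size_t bytes_written = ggml_quantize_q4_0(src, dst, 4096, 4096, 32, hist);
```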
 //
 // system info
 //

---

@@ -6,7 +6,7 @@ using the [convert-pt-to-ggml.py](convert-pt-to-ggml.py) script. You can either obtain
 the `ggml` files yourself using the conversion script, or you can use the [download-ggml-model.sh](download-ggml-model.sh)
 script to download the already converted models. Currently, they are hosted on the following locations:

-- https://huggingface.co/ggerganov/whisper.cpp
+- https://huggingface.co/datasets/ggerganov/whisper.cpp
 - https://ggml.ggerganov.com

 Sample usage:

@@ -23,7 +23,7 @@ You can now use it like this:

 A third option to obtain the model files is to download them from Hugging Face:

-https://huggingface.co/ggerganov/whisper.cpp/tree/main
+https://huggingface.co/datasets/ggerganov/whisper.cpp/tree/main

 ## Available models

---

@@ -79,11 +79,11 @@ dir_model = sys.argv[1]
 dir_whisper = sys.argv[2]
 dir_out = sys.argv[3]

-with open(dir_model + "/vocab.json", "r", encoding="utf8") as f:
+with open(dir_model + "/vocab.json", "r") as f:
     encoder = json.load(f)
-with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f:
+with open(dir_model + "/added_tokens.json", "r") as f:
     encoder_added = json.load(f)
-with open(dir_model + "/config.json", "r", encoding="utf8") as f:
+with open(dir_model + "/config.json", "r") as f:
     hparams = json.load(f)

 model = WhisperForConditionalGeneration.from_pretrained(dir_model)

---

@@ -40,7 +40,7 @@ if exist "ggml-%model%.bin" (
     goto :eof
 )

-PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"
+PowerShell -NoProfile -ExecutionPolicy Bypass -Command "Invoke-WebRequest -Uri https://huggingface.co/datasets/ggerganov/whisper.cpp/resolve/main/ggml-%model%.bin -OutFile ggml-%model%.bin"

 if %ERRORLEVEL% neq 0 (
     echo Failed to download ggml model %model%

---

@@ -6,7 +6,7 @@
 #src="https://ggml.ggerganov.com"
 #pfx="ggml-model-whisper"

-src="https://huggingface.co/ggerganov/whisper.cpp"
+src="https://huggingface.co/datasets/ggerganov/whisper.cpp"
 pfx="resolve/main/ggml"

 # get the path of this script

---

(file diff suppressed because it is too large)

---

whisper.h: 129 lines changed

@@ -66,7 +66,6 @@ extern "C" {
     //

     struct whisper_context;
-    struct whisper_state;

     typedef int whisper_token;

@@ -102,20 +101,11 @@ extern "C" {
     WHISPER_API struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size);
     WHISPER_API struct whisper_context * whisper_init(struct whisper_model_loader * loader);

-    // These are the same as the above, but the internal state of the context is not allocated automatically
-    // It is the responsibility of the caller to allocate the state using whisper_init_state() (#523)
-    WHISPER_API struct whisper_context * whisper_init_from_file_no_state(const char * path_model);
-    WHISPER_API struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t buffer_size);
-    WHISPER_API struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loader);
-
-    WHISPER_API struct whisper_state * whisper_init_state(struct whisper_context * ctx);
-
-    // Frees all allocated memory
+    // Frees all memory allocated by the model.
     WHISPER_API void whisper_free(struct whisper_context * ctx);
-    WHISPER_API void whisper_free_state(struct whisper_state * state);

     // Convert RAW PCM audio to log mel spectrogram.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
+    // The resulting spectrogram is stored inside the provided whisper context.
     // Returns 0 on success
     WHISPER_API int whisper_pcm_to_mel(
             struct whisper_context * ctx,

@@ -123,15 +113,8 @@ extern "C" {
             int n_samples,
             int n_threads);

-    WHISPER_API int whisper_pcm_to_mel_with_state(
-            struct whisper_context * ctx,
-            struct whisper_state * state,
-            const float * samples,
-            int n_samples,
-            int n_threads);
-
     // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
+    // The resulting spectrogram is stored inside the provided whisper context.
     // Returns 0 on success
     WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
             struct whisper_context * ctx,

@@ -139,14 +122,8 @@ extern "C" {
             int n_samples,
             int n_threads);

-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-            struct whisper_context * ctx,
-            struct whisper_state * state,
-            const float * samples,
-            int n_samples,
-            int n_threads);
-
-    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
+    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
     // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
     // n_mel must be 80
     // Returns 0 on success

@@ -156,14 +133,7 @@ extern "C" {
             int n_len,
             int n_mel);

-    WHISPER_API int whisper_set_mel_with_state(
-            struct whisper_context * ctx,
-            struct whisper_state * state,
-            const float * data,
-            int n_len,
-            int n_mel);
-
-    // Run the Whisper encoder on the log mel spectrogram stored inside the default state in the provided whisper context.
+    // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
     // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
     // offset can be used to specify the offset of the first frame in the spectrogram.
     // Returns 0 on success

@@ -172,12 +142,6 @@ extern "C" {
             int offset,
             int n_threads);

-    WHISPER_API int whisper_encode_with_state(
-            struct whisper_context * ctx,
-            struct whisper_state * state,
-            int offset,
-            int n_threads);
-
     // Run the Whisper decoder to obtain the logits and probabilities for the next token.
     // Make sure to call whisper_encode() first.
     // tokens + n_tokens is the provided context for the decoder.

@@ -191,14 +155,6 @@ extern "C" {
             int n_past,
             int n_threads);

-    WHISPER_API int whisper_decode_with_state(
-            struct whisper_context * ctx,
-            struct whisper_state * state,
-            const whisper_token * tokens,
-            int n_tokens,
-            int n_past,
-            int n_threads);
-
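The comments above describe the low-level path: mel, then encoder, then explicit decoder steps. A minimal sketch follows (not part of the diff); the thread counts and the use of `whisper_token_sot()` as the first decoder token are illustrative assumptions:

```c
// mel -> encoder -> one decoder step, per the header comments above
if (whisper_pcm_to_mel(ctx, pcm, n_samples, 4) != 0) { /* handle error */ }
if (whisper_encode(ctx, 0, 4)                  != 0) { /* handle error */ }

whisper_token tokens[1] = { whisper_token_sot(ctx) };
if (whisper_decode(ctx, tokens, 1, 0, 4) != 0) { /* handle error */ }

// logits for the next token live in the last row of the [n_tokens x n_vocab] buffer
float * logits = whisper_get_logits(ctx);
```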
     // Convert the provided text into tokens.
     // The tokens pointer must be large enough to hold the resulting tokens.
     // Returns the number of tokens on success, no more than n_max_tokens
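A short sketch of the tokenizer contract described above, assuming the declaration in this revision is `int whisper_tokenize(struct whisper_context *, const char *, whisper_token *, int)`:

```c
whisper_token tokens[64];

const int n = whisper_tokenize(ctx, "And so my fellow Americans", tokens, 64);
if (n < 0) {
    // the text did not fit into n_max_tokens
} else {
    for (int i = 0; i < n; ++i) {
        printf("%d -> '%s'\n", tokens[i], whisper_token_to_str(ctx, tokens[i]));
    }
}
```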
@@ -234,44 +190,20 @@ extern "C" {
             int n_threads,
             float * lang_probs);

-    WHISPER_API int whisper_lang_auto_detect_with_state(
-            struct whisper_context * ctx,
-            struct whisper_state * state,
-            int offset_ms,
-            int n_threads,
-            float * lang_probs);
-
     WHISPER_API int whisper_n_len           (struct whisper_context * ctx); // mel length
-    WHISPER_API int whisper_n_len_from_state(struct whisper_state * state); // mel length
     WHISPER_API int whisper_n_vocab         (struct whisper_context * ctx);
     WHISPER_API int whisper_n_text_ctx      (struct whisper_context * ctx);
     WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
     WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);

-    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_f16          (struct whisper_context * ctx);
-    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
-
     // Token logits obtained from the last call to whisper_decode()
     // The logits for the last token are stored in the last row
     // Rows: n_tokens
     // Cols: n_vocab
     WHISPER_API float * whisper_get_logits(struct whisper_context * ctx);
-    WHISPER_API float * whisper_get_logits_from_state(struct whisper_state * state);

     // Token Id -> String. Uses the vocabulary in the provided context
     WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
-    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);

     // Special tokens
     WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);

@@ -286,7 +218,7 @@ extern "C" {
     WHISPER_API whisper_token whisper_token_translate (void);
     WHISPER_API whisper_token whisper_token_transcribe(void);

-    // Performance information from the default state.
+    // Performance information
     WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
     WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);

@@ -304,19 +236,18 @@ extern "C" {
     // Text segment callback
     // Called on every newly generated text segment
    // Use the whisper_full_...() functions to obtain the text segments
-    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, struct whisper_state * state, int n_new, void * user_data);
+    typedef void (*whisper_new_segment_callback)(struct whisper_context * ctx, int n_new, void * user_data);
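This typedef is the main extension point for streaming output, so here is a minimal sketch of a callback matching the new (state-free) signature; wiring it up via `params.new_segment_callback` assumes the field names used by `whisper_full_params` in this revision:

```c
// Print each newly generated segment as soon as it is produced.
static void on_new_segment(struct whisper_context * ctx, int n_new, void * user_data) {
    (void) user_data;
    const int n = whisper_full_n_segments(ctx);
    for (int i = n - n_new; i < n; ++i) {
        printf("%s", whisper_full_get_segment_text(ctx, i));
    }
}

// assumed wiring, before calling whisper_full():
// params.new_segment_callback           = on_new_segment;
// params.new_segment_callback_user_data = NULL;
```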
     // Encoder begin callback
     // If not NULL, called before the encoder starts
     // If it returns false, the computation is aborted
-    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, struct whisper_state * state, void * user_data);
+    typedef bool (*whisper_encoder_begin_callback)(struct whisper_context * ctx, void * user_data);

     // Logits filter callback
     // Can be used to modify the logits before sampling
     // If not NULL, called after applying temperature to logits
     typedef void (*whisper_logits_filter_callback)(
             struct whisper_context * ctx,
-            struct whisper_state * state,
             const whisper_token_data * tokens,
             int n_tokens,
             float * logits,

@@ -403,7 +334,6 @@ extern "C" {
     WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy);

     // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
-    // Not thread safe for same context
     // Uses the specified decoding strategy to obtain the text.
     WHISPER_API int whisper_full(
             struct whisper_context * ctx,

@@ -411,16 +341,7 @@ extern "C" {
             const float * samples,
             int n_samples);

-    WHISPER_API int whisper_full_with_state(
-            struct whisper_context * ctx,
-            struct whisper_state * state,
-            struct whisper_full_params params,
-            const float * samples,
-            int n_samples);
-
-    // Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
-    // Result is stored in the default state of the context
-    // Not thread safe if executed in parallel on the same context.
+    // Split the input audio in chunks and process each chunk separately using whisper_full()
     // It seems this approach can offer some speedup in some cases.
     // However, the transcription accuracy can be worse at the beginning and end of each chunk.
     WHISPER_API int whisper_full_parallel(

@@ -430,56 +351,40 @@ extern "C" {
             int n_samples,
             int n_processors);
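To make the "run the entire model" comment above concrete, a minimal end-to-end sketch follows. The model path, thread count, and the assumption that `pcm`/`n_samples` already hold 16 kHz mono float samples are illustrative, not from the diff:

```c
struct whisper_context * ctx = whisper_init_from_file("models/ggml-base.en.bin");

struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
params.n_threads = 4;

// pcm / n_samples: 16 kHz mono float PCM prepared by the caller
if (whisper_full(ctx, params, pcm, n_samples) != 0) {
    fprintf(stderr, "whisper_full() failed\n");
}
```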
-    // Number of generated text segments
+    // Number of generated text segments.
     // A segment can be a few words, a sentence, or even a paragraph.
     WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
-    WHISPER_API int whisper_full_n_segments_from_state(struct whisper_state * state);

-    // Language id associated with the context's default state
+    // Language id associated with the current context
     WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);

-    // Language id associated with the provided state
-    WHISPER_API int whisper_full_lang_id_from_state(struct whisper_state * state);
-
-    // Get the start and end time of the specified segment
+    // Get the start and end time of the specified segment.
     WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t0_from_state(struct whisper_state * state, int i_segment);
     WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
-    WHISPER_API int64_t whisper_full_get_segment_t1_from_state(struct whisper_state * state, int i_segment);

-    // Get the text of the specified segment
+    // Get the text of the specified segment.
     WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
-    WHISPER_API const char * whisper_full_get_segment_text_from_state(struct whisper_state * state, int i_segment);

-    // Get number of tokens in the specified segment
+    // Get number of tokens in the specified segment.
     WHISPER_API int whisper_full_n_tokens(struct whisper_context * ctx, int i_segment);
-    WHISPER_API int whisper_full_n_tokens_from_state(struct whisper_state * state, int i_segment);

-    // Get the token text of the specified token in the specified segment
+    // Get the token text of the specified token in the specified segment.
     WHISPER_API const char * whisper_full_get_token_text(struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API const char * whisper_full_get_token_text_from_state(struct whisper_context * ctx, struct whisper_state * state, int i_segment, int i_token);
     WHISPER_API whisper_token whisper_full_get_token_id (struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token whisper_full_get_token_id_from_state(struct whisper_state * state, int i_segment, int i_token);

-    // Get token data for the specified token in the specified segment
+    // Get token data for the specified token in the specified segment.
     // This contains probabilities, timestamps, etc.
     WHISPER_API whisper_token_data whisper_full_get_token_data(struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API whisper_token_data whisper_full_get_token_data_from_state(struct whisper_state * state, int i_segment, int i_token);

-    // Get the probability of the specified token in the specified segment
+    // Get the probability of the specified token in the specified segment.
     WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
-    WHISPER_API float whisper_full_get_token_p_from_state(struct whisper_state * state, int i_segment, int i_token);
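The accessors above are typically used together once `whisper_full()` returns. A short sketch; the timestamps being in units of 10 ms follows the convention used elsewhere in whisper.cpp, which you should treat as an assumption here:

```c
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
    const int64_t t0 = whisper_full_get_segment_t0(ctx, i); // assumed units of 10 ms
    const int64_t t1 = whisper_full_get_segment_t1(ctx, i);

    printf("[%6lld -> %6lld] %s\n",
           (long long) t0, (long long) t1,
           whisper_full_get_segment_text(ctx, i));
}
```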
     ////////////////////////////////////////////////////////////////////////////

     // Temporary helpers needed for exposing ggml interface

     WHISPER_API int whisper_bench_memcpy(int n_threads);
-    WHISPER_API const char * whisper_bench_memcpy_str(int n_threads);
     WHISPER_API int whisper_bench_ggml_mul_mat(int n_threads);
-    WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);

 #ifdef __cplusplus
 }