forked from extern/whisper.cpp
8de452c18b
* whisper : prepare infra for new decoding strategies * whisper : apply logit filters and compute logprobs * whisper : add whisper_get_logits() * whisper : separate self and cross attention memory Initial step needed for supporting parallel decoders * whisper : move probs_id buffer to whisper_context * whisper : refactor kv cache into separate struct * whisper : move self-attention kv cache to whisper_decoder * whisper : wip decoding parameters + strategies * whisper : wip decoding parameters + strategies (part 2) * whisper : wip decoding parameters + strategies (part 3) * whisper : wip decoding parameters + strategies (part 4) * whisper : fix prompt_past update to not include prompt_init * whisper : temperature + best_of support * whisper : support for compression_ration_threshold We actually use entropy, but it is similar * command : fix example to use logits instead of obsolete probs * whisper : handle empty sequence ranking * whisper : add WHISPER_DEBUG + diagnostic prints + new main args * whisper : minor fixes * whisper : add beam-search support * whisper : bug fix when there no previous context * whisper : add comments * stream : disable temperature fallback For real-time processing, we always want a single decoder running at T=0 * whisper.swiftui : update example - fix paths + add empty folders
217 lines
6.0 KiB
C++
217 lines
6.0 KiB
C++
#include "ggml.h"
|
|
#include "whisper.h"
|
|
|
|
#include <emscripten.h>
|
|
#include <emscripten/bind.h>
|
|
|
|
#include <atomic>
|
|
#include <cmath>
|
|
#include <mutex>
|
|
#include <string>
|
|
#include <thread>
|
|
#include <vector>
|
|
|
|
constexpr int N_THREAD = 8;
|
|
|
|
std::vector<struct whisper_context *> g_contexts(4, nullptr);
|
|
|
|
std::mutex g_mutex;
|
|
std::thread g_worker;
|
|
|
|
std::atomic<bool> g_running(false);
|
|
|
|
std::string g_status = "";
|
|
std::string g_status_forced = "";
|
|
std::string g_transcribed = "";
|
|
|
|
std::vector<float> g_pcmf32;
|
|
|
|
void stream_set_status(const std::string & status) {
|
|
std::lock_guard<std::mutex> lock(g_mutex);
|
|
g_status = status;
|
|
}
|
|
|
|
void stream_main(size_t index) {
|
|
stream_set_status("loading data ...");
|
|
|
|
struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
|
|
|
wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
|
|
wparams.offset_ms = 0;
|
|
wparams.translate = false;
|
|
wparams.no_context = true;
|
|
wparams.single_segment = true;
|
|
wparams.print_realtime = false;
|
|
wparams.print_progress = false;
|
|
wparams.print_timestamps = true;
|
|
wparams.print_special = false;
|
|
|
|
wparams.max_tokens = 32;
|
|
wparams.audio_ctx = 768; // partial encoder context for better performance
|
|
|
|
// disable temperature fallback
|
|
wparams.temperature_inc = -1.0f;
|
|
|
|
wparams.language = "en";
|
|
|
|
printf("stream: using %d threads\n", wparams.n_threads);
|
|
|
|
std::vector<float> pcmf32;
|
|
|
|
// whisper context
|
|
auto & ctx = g_contexts[index];
|
|
|
|
// 5 seconds interval
|
|
const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
|
|
|
|
while (g_running) {
|
|
stream_set_status("waiting for audio ...");
|
|
|
|
{
|
|
std::unique_lock<std::mutex> lock(g_mutex);
|
|
|
|
if (g_pcmf32.size() < 1024) {
|
|
lock.unlock();
|
|
|
|
std::this_thread::sleep_for(std::chrono::milliseconds(10));
|
|
|
|
continue;
|
|
}
|
|
|
|
pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
|
|
g_pcmf32.clear();
|
|
}
|
|
|
|
{
|
|
const auto t_start = std::chrono::high_resolution_clock::now();
|
|
|
|
stream_set_status("running whisper ...");
|
|
|
|
int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
|
|
if (ret != 0) {
|
|
printf("whisper_full() failed: %d\n", ret);
|
|
break;
|
|
}
|
|
|
|
const auto t_end = std::chrono::high_resolution_clock::now();
|
|
|
|
printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
|
|
}
|
|
|
|
{
|
|
std::string text_heard;
|
|
|
|
{
|
|
const int n_segments = whisper_full_n_segments(ctx);
|
|
for (int i = n_segments - 1; i < n_segments; ++i) {
|
|
const char * text = whisper_full_get_segment_text(ctx, i);
|
|
|
|
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
|
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
|
|
|
printf("transcribed: %s\n", text);
|
|
|
|
text_heard += text;
|
|
}
|
|
}
|
|
|
|
{
|
|
std::lock_guard<std::mutex> lock(g_mutex);
|
|
g_transcribed = text_heard;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (index < g_contexts.size()) {
|
|
whisper_free(g_contexts[index]);
|
|
g_contexts[index] = nullptr;
|
|
}
|
|
}
|
|
|
|
EMSCRIPTEN_BINDINGS(stream) {
|
|
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
|
|
for (size_t i = 0; i < g_contexts.size(); ++i) {
|
|
if (g_contexts[i] == nullptr) {
|
|
g_contexts[i] = whisper_init_from_file(path_model.c_str());
|
|
if (g_contexts[i] != nullptr) {
|
|
g_running = true;
|
|
if (g_worker.joinable()) {
|
|
g_worker.join();
|
|
}
|
|
g_worker = std::thread([i]() {
|
|
stream_main(i);
|
|
});
|
|
|
|
return i + 1;
|
|
} else {
|
|
return (size_t) 0;
|
|
}
|
|
}
|
|
}
|
|
|
|
return (size_t) 0;
|
|
}));
|
|
|
|
emscripten::function("free", emscripten::optional_override([](size_t index) {
|
|
if (g_running) {
|
|
g_running = false;
|
|
}
|
|
}));
|
|
|
|
emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
|
|
--index;
|
|
|
|
if (index >= g_contexts.size()) {
|
|
return -1;
|
|
}
|
|
|
|
if (g_contexts[index] == nullptr) {
|
|
return -2;
|
|
}
|
|
|
|
{
|
|
std::lock_guard<std::mutex> lock(g_mutex);
|
|
const int n = audio["length"].as<int>();
|
|
|
|
emscripten::val heap = emscripten::val::module_property("HEAPU8");
|
|
emscripten::val memory = heap["buffer"];
|
|
|
|
g_pcmf32.resize(n);
|
|
|
|
emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
|
|
memoryView.call<void>("set", audio);
|
|
}
|
|
|
|
return 0;
|
|
}));
|
|
|
|
emscripten::function("get_transcribed", emscripten::optional_override([]() {
|
|
std::string transcribed;
|
|
|
|
{
|
|
std::lock_guard<std::mutex> lock(g_mutex);
|
|
transcribed = std::move(g_transcribed);
|
|
}
|
|
|
|
return transcribed;
|
|
}));
|
|
|
|
emscripten::function("get_status", emscripten::optional_override([]() {
|
|
std::string status;
|
|
|
|
{
|
|
std::lock_guard<std::mutex> lock(g_mutex);
|
|
status = g_status_forced.empty() ? g_status : g_status_forced;
|
|
}
|
|
|
|
return status;
|
|
}));
|
|
|
|
emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
|
|
{
|
|
std::lock_guard<std::mutex> lock(g_mutex);
|
|
g_status_forced = status;
|
|
}
|
|
}));
|
|
}
|