Compare commits

..

11 Commits

Author SHA1 Message Date
59c997ca2d wip ignore 2023-02-15 19:11:12 +02:00
459753342d yt-wsp.sh : add unique filename generation (#495)
Co-authored-by: genevera <genevera@noreply.users.github.com>
2023-02-14 20:12:51 +02:00
9764782bd9 readme : add another .NET repo (#303) 2023-02-14 20:04:03 +02:00
3b010f9bed readme : add .NET repo (#303) 2023-02-11 17:35:33 +02:00
113fcec513 cmake : install whisper.h header (#485)
Including the header file in the install bundle helps projects that ship binaries.
2023-02-11 09:13:32 +02:00
cfc06bf8df whisper : suppress non-speech-related token outputs (#473)
* add non-speech-token suppression

* add suppress non-speech_tokens param
2023-02-08 09:05:34 +02:00
2bfe0ebc0f whisper : fixed Beam Search Strategy and exposed whisper_pcm_to_mel_phase_vocoder (#474)
Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com>
2023-02-08 09:01:47 +02:00
4dd7119deb whisper : only trim if split_on_word is true (#476) 2023-02-08 08:43:23 +02:00
ab1916fc59 ci : add node addon test and optimize compilation configuration (#468)
* addon: implement node addon call whisper through cpp

* addon: modify the license to MIT

* addon: remove iostream

* addon: rename dir

* addon: fix typo

* addon: configure cmake to build when cmake-js is used

* ci: add addon.node test ci

* addon: remove build WHISPER_BUILD_TESTS

* addon: update build command

* addon: add test

* addon: add test file

* addon: adapt to compile on Windows

* addon: fix typo

* addon: reuse jfk.wav

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* addon: reuse jfk.wav

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2023-02-05 15:02:08 +02:00
a1c1583cc7 whisper : add whisper_full_lang_id() for getting the context lang (#461) 2023-02-05 14:46:26 +02:00
d012b5c7e4 whisper : add "split_on_word" flag when using using "max_len" option (#455)
* Update whisper.cpp

* fix: trim function

* feat: added flag to split on word

* fix: arguments for main
2023-02-05 14:44:23 +02:00
14 changed files with 959 additions and 84 deletions

48
.github/workflows/examples.yml vendored Normal file
View File

@ -0,0 +1,48 @@
name: Examples Tests
on:
push:
paths:
- examples/addon.node/**
- whisper.h
pull_request:
paths:
- examples/addon.node/**
- whisper.h
jobs:
addon_node-ubuntu-latest:
runs-on: ubuntu-latest
strategy:
matrix:
node-version: [ 16.x, 18.x ]
steps:
- name: Clone
uses: actions/checkout@v1
- name: Dependencies
run: |
sudo apt-get update
sudo apt-get install build-essential
sudo apt-get install cmake
sudo apt-get install libsdl2-dev
- name: Use Node.js ${{ matrix.node-version }}
uses: actions/setup-node@v1
with:
node-version: ${{ matrix.node-version }}
cache: 'npm'
- name: Install package.json dependencies
working-directory: ./examples/addon.node
run: npm install
- name: Compile addon.node
run: npx cmake-js compile -T whisper-addon -B Release
- name: Download test model
run: |
bash ./models/download-ggml-model.sh base.en
- name: Test
run: |
cd examples/addon.node
npm run test

View File

@ -226,10 +226,13 @@ target_compile_definitions(${TARGET} PUBLIC
${WHISPER_EXTRA_FLAGS}
)
set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")
install(TARGETS ${TARGET}
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib/static
RUNTIME DESTINATION bin
PUBLIC_HEADER DESTINATION include
)
#
@ -242,7 +245,7 @@ add_subdirectory(bindings)
# programs, examples and tests
#
if (WHISPER_BUILD_TESTS)
if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
enable_testing()
add_subdirectory(tests)
endif ()

View File

@ -465,6 +465,9 @@ in [models](models).
- [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
- [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
- [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
- [X] .NET:
- [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
- [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)
## Examples

View File

@ -24,3 +24,8 @@ target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
#==================================================================
target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})
if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
# Generate node.lib
execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
endif()

View File

@ -14,14 +14,14 @@ npm install
Make sure it is in the project root directory and compiled with make-js.
```shell
npx cmake-js compile -T whisper-addon
npx cmake-js compile -T whisper-addon -B Release
```
For Electron addon and cmake-js options, you can see [cmake-js](https://github.com/cmake-js/cmake-js) and make very few configuration changes.
> Such as appointing special cmake path:
> ```shell
> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon
> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon -B Release
> ```
## Run

View File

@ -0,0 +1,15 @@
const path = require('path');
const { whisper } = require(path.join(__dirname, '../../../build/Release/whisper-addon'));
const whisperParamsMock = {
language: 'en',
model: path.join(__dirname, '../../../models/ggml-base.en.bin'),
fname_inp: path.join(__dirname, '../../../samples/jfk.wav'),
};
describe("Run whisper.node", () => {
test("it should receive a non-empty value", () => {
expect(whisper(whisperParamsMock).length).toBeGreaterThan(0);
});
});

View File

@ -1,3 +1,4 @@
#include <cstdint>
#include <string>
#include <thread>
#include <vector>
@ -398,9 +399,9 @@ Napi::Object whisper(const Napi::CallbackInfo& info) {
}
Napi::Object res = Napi::Array::New(env, result.size());
for (u_int32_t i = 0; i < result.size(); ++i) {
for (uint64_t i = 0; i < result.size(); ++i) {
Napi::Object tmp = Napi::Array::New(env, 3);
for (u_int32_t j = 0; j < 3; ++j) {
for (uint64_t j = 0; j < 3; ++j) {
tmp[j] = Napi::String::New(env, result[i][j]);
}
res[i] = tmp;

View File

@ -5,8 +5,12 @@
"main": "index.js",
"author": "Qanhe Chen",
"license": "MIT",
"scripts": {
"test": "jest"
},
"devDependencies": {
"cmake-js": "^7.1.1",
"jest": "^29.4.0",
"node-addon-api": "^5.0.0"
}
}

View File

@ -0,0 +1,10 @@
if (WHISPER_SUPPORT_SDL2)
# chess
set(TARGET chess)
add_executable(${TARGET} chess.cpp)
include(DefaultTargetOptions)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
target_link_libraries(${TARGET} PRIVATE common whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endif ()

634
examples/chess/chess.cpp Normal file
View File

@ -0,0 +1,634 @@
// Input chess moves via voice
//
#include "common.h"
#include "whisper.h"
#include <SDL.h>
#include <SDL_audio.h>
#include <atomic>
#include <cassert>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <fstream>
#include <mutex>
// 500 -> 00:05.000
// 6000 -> 01:00.000
std::string to_timestamp(int64_t t) {
int64_t sec = t/100;
int64_t msec = t - sec*100;
int64_t min = sec/60;
sec = sec - min*60;
char buf[32];
snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
return std::string(buf);
}
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t step_ms = 3000;
int32_t length_ms = 10000;
int32_t keep_ms = 200;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool translate = false;
bool print_special = false;
bool no_context = true;
bool no_timestamps = false;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
std::string fname_inp;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if ( arg == "--step") { params.step_ms = std::stoi(argv[++i]); }
else if ( arg == "--length") { params.length_ms = std::stoi(argv[++i]); }
else if ( arg == "--keep") { params.keep_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-kc" || arg == "--keep-context") { params.no_context = false; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " --step N [%-7d] audio step size in milliseconds\n", params.step_ms);
fprintf(stderr, " --length N [%-7d] audio length in milliseconds\n", params.length_ms);
fprintf(stderr, " --keep N [%-7d] audio to keep from previous step in ms\n", params.keep_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -kc, --keep-context [%-7s] keep context between audio chunks\n", params.no_context ? "false" : "true");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", params.fname_inp.c_str());
fprintf(stderr, "\n");
}
//
// SDL Audio capture
//
class audio_async {
public:
audio_async(int len_ms);
~audio_async();
bool init(int capture_id, int sample_rate);
// start capturing audio via the provided SDL callback
// keep last len_ms seconds of audio in a circular buffer
bool resume();
bool pause();
bool clear();
// callback to be called by SDL
void callback(uint8_t * stream, int len);
// get audio data from the circular buffer
void get(int ms, std::vector<float> & audio);
private:
SDL_AudioDeviceID m_dev_id_in = 0;
int m_len_ms = 0;
int m_sample_rate = 0;
std::atomic_bool m_running;
std::mutex m_mutex;
std::vector<float> m_audio;
std::vector<float> m_audio_new;
size_t m_audio_pos = 0;
size_t m_audio_len = 0;
};
audio_async::audio_async(int len_ms) {
m_len_ms = len_ms;
m_running = false;
}
audio_async::~audio_async() {
if (m_dev_id_in) {
SDL_CloseAudioDevice(m_dev_id_in);
}
}
bool audio_async::init(int capture_id, int sample_rate) {
SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
if (SDL_Init(SDL_INIT_AUDIO) < 0) {
SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
return false;
}
SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
{
int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
for (int i = 0; i < nDevices; i++) {
fprintf(stderr, "%s: - Capture device #%d: '%s'\n", __func__, i, SDL_GetAudioDeviceName(i, SDL_TRUE));
}
}
SDL_AudioSpec capture_spec_requested;
SDL_AudioSpec capture_spec_obtained;
SDL_zero(capture_spec_requested);
SDL_zero(capture_spec_obtained);
capture_spec_requested.freq = sample_rate;
capture_spec_requested.format = AUDIO_F32;
capture_spec_requested.channels = 1;
capture_spec_requested.samples = 1024;
capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
audio_async * audio = (audio_async *) userdata;
audio->callback(stream, len);
};
capture_spec_requested.userdata = this;
if (capture_id >= 0) {
fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, SDL_GetAudioDeviceName(capture_id, SDL_TRUE));
m_dev_id_in = SDL_OpenAudioDevice(SDL_GetAudioDeviceName(capture_id, SDL_TRUE), SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
} else {
fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
m_dev_id_in = SDL_OpenAudioDevice(nullptr, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
}
if (!m_dev_id_in) {
fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
m_dev_id_in = 0;
return false;
} else {
fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
fprintf(stderr, "%s: - sample rate: %d\n", __func__, capture_spec_obtained.freq);
fprintf(stderr, "%s: - format: %d (required: %d)\n", __func__, capture_spec_obtained.format,
capture_spec_requested.format);
fprintf(stderr, "%s: - channels: %d (required: %d)\n", __func__, capture_spec_obtained.channels,
capture_spec_requested.channels);
fprintf(stderr, "%s: - samples per frame: %d\n", __func__, capture_spec_obtained.samples);
}
m_sample_rate = capture_spec_obtained.freq;
m_audio.resize((m_sample_rate*m_len_ms)/1000);
return true;
}
bool audio_async::resume() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to resume!\n", __func__);
return false;
}
if (m_running) {
fprintf(stderr, "%s: already running!\n", __func__);
return false;
}
SDL_PauseAudioDevice(m_dev_id_in, 0);
m_running = true;
return true;
}
bool audio_async::pause() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to pause!\n", __func__);
return false;
}
if (!m_running) {
fprintf(stderr, "%s: already paused!\n", __func__);
return false;
}
SDL_PauseAudioDevice(m_dev_id_in, 1);
m_running = false;
return true;
}
bool audio_async::clear() {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to clear!\n", __func__);
return false;
}
if (!m_running) {
fprintf(stderr, "%s: not running!\n", __func__);
return false;
}
{
std::lock_guard<std::mutex> lock(m_mutex);
m_audio_pos = 0;
m_audio_len = 0;
}
return true;
}
// callback to be called by SDL
void audio_async::callback(uint8_t * stream, int len) {
if (!m_running) {
return;
}
const size_t n_samples = len / sizeof(float);
m_audio_new.resize(n_samples);
memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
//fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
{
std::lock_guard<std::mutex> lock(m_mutex);
if (m_audio_pos + n_samples > m_audio.size()) {
const size_t n0 = m_audio.size() - m_audio_pos;
memcpy(&m_audio[m_audio_pos], stream, n0 * sizeof(float));
memcpy(&m_audio[0], &stream[n0], (n_samples - n0) * sizeof(float));
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
m_audio_len = m_audio.size();
} else {
memcpy(&m_audio[m_audio_pos], stream, n_samples * sizeof(float));
m_audio_pos = (m_audio_pos + n_samples) % m_audio.size();
m_audio_len = std::min(m_audio_len + n_samples, m_audio.size());
}
}
}
void audio_async::get(int ms, std::vector<float> & result) {
if (!m_dev_id_in) {
fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
return;
}
if (!m_running) {
fprintf(stderr, "%s: not running!\n", __func__);
return;
}
result.clear();
{
std::lock_guard<std::mutex> lock(m_mutex);
if (ms <= 0) {
ms = m_len_ms;
}
size_t n_samples = (m_sample_rate * ms) / 1000;
if (n_samples > m_audio_len) {
n_samples = m_audio_len;
}
result.resize(n_samples);
int s0 = m_audio_pos - n_samples;
if (s0 < 0) {
s0 += m_audio.size();
}
if (s0 + n_samples > m_audio.size()) {
const size_t n0 = m_audio.size() - s0;
memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
memcpy(&result[n0], &m_audio[0], (n_samples - n0) * sizeof(float));
} else {
memcpy(result.data(), &m_audio[s0], n_samples * sizeof(float));
}
}
}
///////////////////////////
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
params.keep_ms = std::min(params.keep_ms, params.step_ms);
params.length_ms = std::max(params.length_ms, params.step_ms);
const int n_samples_step = (1e-3*params.step_ms )*WHISPER_SAMPLE_RATE;
const int n_samples_len = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
const int n_samples_keep = (1e-3*params.keep_ms )*WHISPER_SAMPLE_RATE;
const int n_samples_30s = (1e-3*30000.0 )*WHISPER_SAMPLE_RATE;
const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
params.no_timestamps = !use_vad;
params.no_context |= use_vad;
params.max_tokens = 0;
// init audio
audio_async audio(params.length_ms);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
// whisper init
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
std::vector<float> pcmf32 (n_samples_30s, 0.0f);
std::vector<float> pcmf32_old;
std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
std::vector<whisper_token> prompt_tokens;
// print some info about the processing
{
fprintf(stderr, "\n");
if (!whisper_is_multilingual(ctx)) {
if (params.language != "en" || params.translate) {
params.language = "en";
params.translate = false;
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
}
}
fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
__func__,
n_samples_step,
float(n_samples_step)/WHISPER_SAMPLE_RATE,
float(n_samples_len )/WHISPER_SAMPLE_RATE,
float(n_samples_keep)/WHISPER_SAMPLE_RATE,
params.n_threads,
params.language.c_str(),
params.translate ? "translate" : "transcribe",
params.no_timestamps ? 0 : 1);
if (!use_vad) {
fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
} else {
fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
}
fprintf(stderr, "\n");
}
int n_iter = 0;
bool is_running = true;
printf("[Start speaking]");
fflush(stdout);
auto t_last = std::chrono::high_resolution_clock::now();
const auto t_start = t_last;
// main audio loop
while (is_running) {
// handle Ctrl + C
{
SDL_Event event;
while (SDL_PollEvent(&event)) {
switch (event.type) {
case SDL_QUIT:
{
is_running = false;
} break;
default:
break;
}
}
if (!is_running) {
break;
}
}
if (!is_running) {
break;
}
// process new audio
if (!use_vad) {
while (true) {
audio.get(params.step_ms, pcmf32_new);
if ((int) pcmf32_new.size() > 2*n_samples_step) {
fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
audio.clear();
continue;
}
if ((int) pcmf32_new.size() >= n_samples_step) {
audio.clear();
break;
}
SDL_Delay(1);
}
const int n_samples_new = pcmf32_new.size();
// take up to params.length_ms audio from previous iteration
const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
//printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
pcmf32.resize(n_samples_new + n_samples_take);
for (int i = 0; i < n_samples_take; i++) {
pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
}
memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
pcmf32_old = pcmf32;
} else {
const auto t_now = std::chrono::high_resolution_clock::now();
const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
if (t_diff < 2000) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
continue;
}
audio.get(2000, pcmf32_new);
if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
audio.get(params.length_ms, pcmf32);
} else {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
continue;
}
t_last = t_now;
}
// run the inference
{
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = !params.no_timestamps;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = !use_vad;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
// disable temperature fallback
wparams.temperature_inc = -1.0f;
wparams.prompt_tokens = params.no_context ? nullptr : prompt_tokens.data();
wparams.prompt_n_tokens = params.no_context ? 0 : prompt_tokens.size();
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 6;
}
// print result;
{
if (!use_vad) {
printf("\33[2K\r");
// print long empty line to clear the previous line
printf("%s", std::string(100, ' ').c_str());
printf("\33[2K\r");
} else {
const int64_t t1 = (t_last - t_start).count()/1000000;
const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
printf("\n");
printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
printf("\n");
}
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i);
if (params.no_timestamps) {
printf("%s", text);
fflush(stdout);
} else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
}
}
if (use_vad){
printf("\n");
printf("### Transcription %d END\n", n_iter);
}
}
++n_iter;
if (!use_vad && (n_iter % n_new_line) == 0) {
printf("\n");
// keep part of the audio for next iteration to try to mitigate word boundary issues
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
// Add tokens of the last full length segment as the prompt
if (!params.no_context) {
prompt_tokens.clear();
const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) {
const int token_count = whisper_full_n_tokens(ctx, i);
for (int j = 0; j < token_count; ++j) {
prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
}
}
}
}
}
}
audio.pause();
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

View File

@ -69,6 +69,7 @@ struct whisper_params {
bool speed_up = false;
bool translate = false;
bool diarize = false;
bool split_on_word = false;
bool no_fallback = false;
bool output_txt = false;
bool output_vtt = false;
@ -118,6 +119,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-sow" || arg == "--split-on-word") { params.split_on_word = true; }
else if (arg == "-nf" || arg == "--no-fallback") { params.no_fallback = true; }
else if (arg == "-otxt" || arg == "--output-txt") { params.output_txt = true; }
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
@ -156,6 +158,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -d N, --duration N [%-7d] duration of audio to process in milliseconds\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N [%-7d] maximum number of text context tokens to store\n", params.max_context);
fprintf(stderr, " -ml N, --max-len N [%-7d] maximum segment length in characters\n", params.max_len);
fprintf(stderr, " -sow, --split-on-word [%-7s] split on word rather than on token\n", params.split_on_word ? "true" : "false");
fprintf(stderr, " -bo N, --best-of N [%-7d] number of best candidates to keep\n", params.best_of);
fprintf(stderr, " -bs N, --beam-size N [%-7d] beam size for beam search\n", params.beam_size);
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
@ -651,6 +654,7 @@ int main(int argc, char ** argv) {
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
wparams.speed_up = params.speed_up;

View File

@ -1,20 +1,10 @@
#!/usr/bin/env bash
# Small shell script to more easily automatically download and transcribe live stream VODs.
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
# Use `./examples/yt-wsp.sh help` to print help info.
#
# Sample usage:
#
# git clone https://github.com/ggerganov/whisper.cpp
# cd whisper.cpp
# make
# ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
#
# shellcheck disable=2086
# MIT License
# Copyright (c) 2022 Daniils Petrovs
# Copyright (c) 2023 Jennifer Capasso
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
@ -34,114 +24,181 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Small shell script to more easily automatically download and transcribe live stream VODs.
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
# Use `./examples/yt-wsp.sh help` to print help info.
#
# Sample usage:
#
# git clone https://github.com/ggerganov/whisper.cpp
# cd whisper.cpp
# make
# ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
#
set -Eeuo pipefail
# You can find how to download models in the OG repo: https://github.com/ggerganov/whisper.cpp/#usage
MODEL_PATH="${MODEL_PATH:-models/ggml-base.en.bin}" # Set to a multilingual model if you want to translate from foreign lang to en
WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-whisper}" # Where to find the whisper.cpp executable
WHISPER_LANG="${WHISPER_LANG:-en}" # Set to desired lang to translate from
# get script file location
SCRIPT_PATH="$(realpath -e ${BASH_SOURCE[0]})";
SCRIPT_DIR="${SCRIPT_PATH%/*}"
################################################################################
# Documentation on downloading models can be found in the whisper.cpp repo:
# https://github.com/ggerganov/whisper.cpp/#usage
#
# note: unless a multilingual model is specified, WHISPER_LANG will be ignored
# and the video will be transcribed as if the audio were in the English language
################################################################################
MODEL_PATH="${MODEL_PATH:-${SCRIPT_DIR}/../models/ggml-base.en.bin}"
################################################################################
# Where to find the whisper.cpp executable. default to the examples directory
# which holds this script in source control
################################################################################
WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-${SCRIPT_DIR}/../main}";
# Set to desired language to be translated into english
WHISPER_LANG="${WHISPER_LANG:-en}";
# Default to 4 threads (this was most performant on my 2020 M1 MBP)
WHISPER_THREAD_COUNT="${WHISPER_THREAD_COUNT:-4}";
msg() {
echo >&2 -e "${1-}"
}
################################################################################
# create a temporary directory to work in
# set the temp_dir and temp_filename variables
################################################################################
temp_dir="$(mktemp -d ${SCRIPT_DIR}/tmp.XXXXXX)";
temp_filename="${temp_dir}/yt-dlp-filename";
################################################################################
# for now we only take one argument
# TODO: a for loop
################################################################################
source_url="${1}"
title_name="";
cleanup() {
msg "Cleaning up..."
rm -rf "${temp_dir}" "vod-resampled.wav" "vod-resampled.wav.srt"
local -r clean_me="${1}";
if [ -d "${clean_me}" ]; then
msg "Cleaning up...";
rm -rf "${clean_me}";
else
msg "'${clean_me}' does not appear to be a directory!";
exit 1;
fi;
}
print_help() {
echo "################################################################################"
echo "Usage: ./examples/yt-wsp.sh <video_url>"
echo "See configurable env variables in the script"
echo "This will produce an MP4 muxed file called res.mp4 in the working directory"
echo "Requirements: ffmpeg yt-dlp whisper"
echo "Whisper needs to be built into the main binary with make, then you can rename it to something like 'whisper' and add it to your PATH for convenience."
echo "E.g. in the root of Whisper.cpp, run: 'make && cp ./main /usr/local/bin/whisper'"
echo "# See configurable env variables in the script; there are many!"
echo "# This script will produce an MP4 muxed file in the working directory; it will"
echo "# be named for the title and id of the video."
echo "# passing in https://youtu.be/VYJtb2YXae8 produces a file named";
echo "# 'Why_we_all_need_subtitles_now-VYJtb2YXae8-res.mp4'"
echo "# Requirements: ffmpeg yt-dlp whisper.cpp"
echo "################################################################################"
}
check_requirements() {
if ! command -v ffmpeg &>/dev/null; then
echo "ffmpeg is required (https://ffmpeg.org)."
echo "ffmpeg is required: https://ffmpeg.org";
exit 1
fi
fi;
if ! command -v yt-dlp &>/dev/null; then
echo "yt-dlp is required (https://github.com/yt-dlp/yt-dlp)."
exit 1
fi
echo "yt-dlp is required: https://github.com/yt-dlp/yt-dlp";
exit 1;
fi;
if ! command -v "${WHISPER_EXECUTABLE}" &>/dev/null; then
echo "The C++ implementation of Whisper is required: https://github.com/ggerganov/whisper.cpp"
echo "Sample usage:";
echo "";
echo " git clone https://github.com/ggerganov/whisper.cpp";
echo " cd whisper.cpp";
echo " make";
echo " ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890";
echo "";
exit 1;
fi;
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
WHISPER_EXECUTABLE="./main"
if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
echo "Whisper is required (https://github.com/ggerganov/whisper.cpp):"
echo "Sample usage:"
echo ""
echo " git clone https://github.com/ggerganov/whisper.cpp"
echo " cd whisper.cpp"
echo " make"
echo " ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890"
echo ""
exit 1
fi
fi
}
if [[ $# -lt 1 ]]; then
print_help
exit 1
if [[ "${#}" -lt 1 ]]; then
print_help;
exit 1;
fi
if [[ "$1" == "help" ]]; then
print_help
exit 0
if [[ "${1##-*}" == "help" ]]; then
print_help;
exit 0;
fi
temp_dir="tmp"
source_url="$1"
check_requirements;
check_requirements
msg "Downloading VOD...";
msg "Downloading VOD..."
# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] for members only VODs
################################################################################
# Download the video, put the dynamic output filename into a variable.
# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]
# for videos only available to logged-in users.
################################################################################
yt-dlp \
-f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" \
-o "${temp_dir}/%(title)s-%(id)s.vod.mp4" \
--print-to-file "%(filename)s" "${temp_filename}" \
--no-simulate \
--no-write-auto-subs \
--restrict-filenames \
--embed-thumbnail \
--embed-chapters \
--xattrs \
"${source_url}" -o "${temp_dir}/vod.mp4"
"${source_url}";
msg "Extracting audio and resampling..."
title_name="$(xargs basename -s .vod.mp4 < ${temp_filename})";
ffmpeg -i "${temp_dir}/vod.mp4" \
msg "Extracting audio and resampling...";
ffmpeg -i "${temp_dir}/${title_name}.vod.mp4" \
-hide_banner \
-vn \
-loglevel error \
-ar 16000 \
-ac 1 \
-c:a \
pcm_s16le -y "vod-resampled.wav"
-c:a pcm_s16le \
-y \
"${temp_dir}/${title_name}.vod-resampled.wav";
msg "Transcribing to subtitle file..."
msg "Whisper specified at: ${WHISPER_EXECUTABLE}"
msg "Transcribing to subtitle file...";
msg "Whisper specified at: '${WHISPER_EXECUTABLE}'";
$WHISPER_EXECUTABLE \
"${WHISPER_EXECUTABLE}" \
-m "${MODEL_PATH}" \
-l "${WHISPER_LANG}" \
-f "vod-resampled.wav" \
-t 8 \
-f "${temp_dir}/${title_name}.vod-resampled.wav" \
-t "${WHISPER_THREAD_COUNT}" \
-osrt \
--translate
--translate;
msg "Embedding subtitle track..."
msg "Embedding subtitle track...";
ffmpeg -i "${temp_dir}/vod.mp4" \
ffmpeg -i "${temp_dir}/${title_name}.vod.mp4" \
-hide_banner \
-loglevel error \
-i "vod-resampled.wav.srt" \
-i "${temp_dir}/${title_name}.vod-resampled.wav.srt" \
-c copy \
-c:s mov_text \
-y res.mp4
-y "${title_name}-res.mp4";
cleanup
cleanup "${temp_dir}";
msg "Done! Your finished file is ready: res.mp4"
msg "Done! Your finished file is ready: ${title_name}-res.mp4";

View File

@ -592,6 +592,8 @@ struct whisper_context {
mutable std::mt19937 rng; // used for sampling at t > 0.0
int lang_id;
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg;
int64_t t_last;
@ -2903,7 +2905,7 @@ const char * whisper_print_system_info(void) {
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
struct whisper_full_params result = {
/*.strategy =*/ WHISPER_SAMPLING_GREEDY,
/*.strategy =*/ strategy,
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
@ -2922,6 +2924,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.thold_pt =*/ 0.01f,
/*.thold_ptsum =*/ 0.01f,
/*.max_len =*/ 0,
/*.split_on_word =*/ false,
/*.max_tokens =*/ 0,
/*.speed_up =*/ false,
@ -2933,6 +2936,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
/*.language =*/ "en",
/*.suppress_blank =*/ true,
/*.suppress_non_speech_tokens =*/true,
/*.temperature =*/ 0.0f,
/*.max_initial_ts =*/ 1.0f,
@ -2988,9 +2992,35 @@ static void whisper_exp_compute_token_level_timestamps(
float thold_pt,
float thold_ptsum);
// trim from start (in place)
static inline void ltrim(std::string &s) {
s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
return !std::isspace(ch);
}));
}
// trim from end (in place)
static inline void rtrim(std::string &s) {
s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
return !std::isspace(ch);
}).base(), s.end());
}
// trim from both ends (in place)
static inline void trim(std::string &s) {
rtrim(s);
ltrim(s);
}
static inline bool should_split_on_word(const char * txt, bool split_on_word) {
if (!split_on_word) return true;
return txt[0] == ' ';
}
// wrap the last segment to max_len characters
// returns the number of new segments
static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool split_on_word) {
auto segment = ctx.result_all.back();
int res = 1;
@ -3005,11 +3035,14 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
}
const auto txt = whisper_token_to_str(&ctx, token.id);
const int cur = strlen(txt);
if (acc + cur > max_len && i > 0) {
if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
// split here
if (split_on_word) {
trim(text);
}
ctx.result_all.back().text = std::move(text);
ctx.result_all.back().t1 = token.t0;
ctx.result_all.back().tokens.resize(i);
@ -3037,11 +3070,22 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
}
}
if (split_on_word) {
trim(text);
}
ctx.result_all.back().text = std::move(text);
return res;
}
static const std::vector<std::string> non_speech_tokens
{
"\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
"_", "`", "{", "|", "}", "~", "", "", "", "", "<<", ">>", "<<<", ">>>", "--",
"---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
"♪♪♪","", "", "", "", "", "", ""
};
// process the logits for the selected decoder
// - applies logit filters
// - computes logprobs and probs
@ -3102,6 +3146,33 @@ static void whisper_process_logits(
logits[vocab.token_translate] = -INFINITY;
logits[vocab.token_transcribe] = -INFINITY;
// suppress non-speech tokens
// ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
if (params.suppress_non_speech_tokens)
{
for (const std::string &token : non_speech_tokens)
{
std::string suppress_tokens[] = {token, " " + token};
for (const std::string &suppress_token : suppress_tokens)
{
if (vocab.token_to_id.find(suppress_token) != vocab.token_to_id.end())
{
logits[vocab.token_to_id.at(suppress_token)] = -INFINITY;
}
}
}
// allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
if (vocab.token_to_id.find(" -") != vocab.token_to_id.end())
{
logits[vocab.token_to_id.at(" -")] = -INFINITY;
}
if (vocab.token_to_id.find(" '") != vocab.token_to_id.end())
{
logits[vocab.token_to_id.at(" '")] = -INFINITY;
}
}
// timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
// https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L414-L424
{
@ -3449,7 +3520,7 @@ int whisper_full(
fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
return -3;
}
ctx->lang_id = lang_id;
params.language = whisper_lang_str(lang_id);
fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
@ -3546,6 +3617,7 @@ int whisper_full(
std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
if (whisper_is_multilingual(ctx)) {
const int lang_id = whisper_lang_id(params.language);
ctx->lang_id = lang_id;
prompt_init.push_back(whisper_token_lang(ctx, lang_id));
if (params.translate) {
prompt_init.push_back(whisper_token_translate());
@ -3793,7 +3865,7 @@ int whisper_full(
auto & cur = beam_candidates[cur_c++];
while (beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
while (beam_candidates.size() > cur_c && beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
++cur_c;
}
@ -4069,7 +4141,7 @@ int whisper_full(
*ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
if (params.max_len > 0) {
n_new = whisper_wrap_segment(*ctx, params.max_len);
n_new = whisper_wrap_segment(*ctx, params.max_len, params.split_on_word);
}
}
if (params.new_segment_callback) {
@ -4113,7 +4185,7 @@ int whisper_full(
*ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);
if (params.max_len > 0) {
n_new = whisper_wrap_segment(*ctx, params.max_len);
n_new = whisper_wrap_segment(*ctx, params.max_len, params.split_on_word);
}
}
if (params.new_segment_callback) {
@ -4266,6 +4338,10 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
return ctx->result_all.size();
}
int whisper_full_lang_id(struct whisper_context * ctx) {
return ctx->lang_id;
}
int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
return ctx->result_all[i_segment].t0;
}

View File

@ -113,6 +113,16 @@ extern "C" {
int n_samples,
int n_threads);
// Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
// The resulting spectrogram is stored inside the provided whisper context.
// Returns 0 on success
WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
struct whisper_context* ctx,
const float* samples,
int n_samples,
int n_threads);
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
// n_mel must be 80
@ -257,6 +267,7 @@ extern "C" {
float thold_pt; // timestamp token probability threshold (~0.01)
float thold_ptsum; // timestamp token sum probability threshold (~0.01)
int max_len; // max segment length in characters
bool split_on_word; // split on word rather than on token (when used with max_len)
int max_tokens; // max tokens per segment (0 = no limit)
// [EXPERIMENTAL] speed-up techniques
@ -274,6 +285,7 @@ extern "C" {
// common decoding parameters:
bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
@ -329,6 +341,9 @@ extern "C" {
// A segment can be a few words, a sentence, or even a paragraph.
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
// Language id associated with the current context
WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
// Get the start and end time of the specified segment.
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);