wip ignore

yt-wsp.sh : add unique filename generation (#495 )
Co-authored-by: genevera <genevera@noreply.users.github.com>
2025-07-03 07:50:28 +02:00 · 2023-02-15 19:11:12 +02:00 · 2023-02-14 20:12:51 +02:00 · 2023-02-14 20:04:03 +02:00 · 2023-02-11 17:35:33 +02:00 · 2023-02-11 09:13:32 +02:00
14 changed files with 959 additions and 84 deletions
--- a/.github/workflows/examples.yml
+++ b/.github/workflows/examples.yml
@ -0,0 +1,48 @@
+# Builds the Node.js addon example and runs its jest test whenever the addon
+# sources or the public whisper header change.
+name: Examples Tests
+on:
+  push:
+    paths:
+      - examples/addon.node/**
+      - whisper.h
+  pull_request:
+    paths:
+      - examples/addon.node/**
+      - whisper.h
+
+jobs:
+  addon_node-ubuntu-latest:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        node-version: [ 16.x, 18.x ]
+    steps:
+      - name: Clone
+        uses: actions/checkout@v1
+
+      # -y keeps the install non-interactive even on runners whose apt
+      # configuration does not already assume "yes"
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y build-essential
+          sudo apt-get install -y cmake
+          sudo apt-get install -y libsdl2-dev
+
+      # NOTE(review): the `cache` input is believed to exist only in
+      # actions/setup-node v2 and later - confirm it has any effect with @v1,
+      # or bump the action version (which needs a lock file to hash).
+      - name: Use Node.js ${{ matrix.node-version }}
+        uses: actions/setup-node@v1
+        with:
+          node-version: ${{ matrix.node-version }}
+          cache: 'npm'
+
+      - name: Install package.json dependencies
+        working-directory: ./examples/addon.node
+        run: npm install
+
+      # run from the repository root so the output lands in ./build/Release,
+      # which is where the test loads the addon from
+      - name: Compile addon.node
+        run: npx cmake-js compile -T whisper-addon -B Release
+
+      - name: Download test model
+        run: |
+          bash ./models/download-ggml-model.sh base.en
+
+      - name: Test
+        run: |
+          cd examples/addon.node
+          npm run test
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -226,10 +226,13 @@ target_compile_definitions(${TARGET} PUBLIC
    ${WHISPER_EXTRA_FLAGS}
    )

+set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "whisper.h")
+
 install(TARGETS ${TARGET}
    LIBRARY DESTINATION lib
    ARCHIVE DESTINATION lib/static
    RUNTIME DESTINATION bin
+    PUBLIC_HEADER DESTINATION include
    )

 #
@ -242,7 +245,7 @@ add_subdirectory(bindings)
 # programs, examples and tests
 #

-if (WHISPER_BUILD_TESTS)
+if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
    enable_testing()
    add_subdirectory(tests)
 endif ()
--- a/README.md
+++ b/README.md
@ -465,6 +465,9 @@ in [models](models).
 - [X] Javascript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
 - [X] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
 - [X] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
+- [X] .NET:
+  - [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
+  - [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
 - [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)

 ## Examples
--- a/examples/addon.node/CMakeLists.txt
+++ b/examples/addon.node/CMakeLists.txt
@ -24,3 +24,8 @@ target_include_directories(${TARGET} PRIVATE ${NODE_ADDON_API_DIR})
 #==================================================================

 target_link_libraries(${TARGET} ${CMAKE_JS_LIB} whisper ${CMAKE_THREAD_LIBS_INIT})
+
+# MSVC only: when both the node .def file and the output path of the import
+# library are defined (presumably by cmake-js - verify), invoke the archiver
+# to produce that library. execute_process() runs at configure time.
+if(MSVC AND CMAKE_JS_NODELIB_DEF AND CMAKE_JS_NODELIB_TARGET)
+    # Generate node.lib
+    # NOTE(review): the paths are expanded unquoted, so a path containing
+    # spaces or semicolons would be split into several arguments - confirm.
+    execute_process(COMMAND ${CMAKE_AR} /def:${CMAKE_JS_NODELIB_DEF} /out:${CMAKE_JS_NODELIB_TARGET} ${CMAKE_STATIC_LINKER_FLAGS})
+endif()
--- a/examples/addon.node/README.md
+++ b/examples/addon.node/README.md
@ -14,14 +14,14 @@ npm install
 Make sure you are in the project root directory, then compile the addon with cmake-js.

 ```shell
-npx cmake-js compile -T whisper-addon
+npx cmake-js compile -T whisper-addon -B Release
 ```

 For building an Electron addon, and for other cmake-js options, see [cmake-js](https://github.com/cmake-js/cmake-js); only a few configuration changes are needed.

 > For example, to specify a custom cmake path:
 > ```shell
-> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon
+> npx cmake-js compile -c 'xxx/cmake' -T whisper-addon -B Release
 > ```

 ## Run
--- a/examples/addon.node/test/whisper.spec.js
+++ b/examples/addon.node/test/whisper.spec.js
@ -0,0 +1,15 @@
+const path = require('path');
+
+// Every fixture is addressed relative to the repository root, which sits
+// three directories above this spec file.
+const repoRoot = path.join(__dirname, '..', '..', '..');
+const fromRoot = (...segments) => path.join(repoRoot, ...segments);
+
+// Native addon built by `npx cmake-js compile -T whisper-addon -B Release`.
+const { whisper } = require(fromRoot('build', 'Release', 'whisper-addon'));
+
+// Minimal parameter set handed to the addon.
+const whisperParamsMock = {
+    language: 'en',
+    model: fromRoot('models', 'ggml-base.en.bin'),
+    fname_inp: fromRoot('samples', 'jfk.wav'),
+};
+
+describe("Run whisper.node", () => {
+
+    test("it should receive a non-empty value", () => {
+        const output = whisper(whisperParamsMock);
+
+        expect(output.length).toBeGreaterThan(0);
+    });
+});
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -1,3 +1,4 @@
+#include <cstdint>
 #include <string>
 #include <thread>
 #include <vector>
@ -398,9 +399,9 @@ Napi::Object whisper(const Napi::CallbackInfo& info) {
    }

    Napi::Object res = Napi::Array::New(env, result.size());
-    for (u_int32_t i = 0; i < result.size(); ++i) {
+    for (uint64_t i = 0; i < result.size(); ++i) {
        Napi::Object tmp = Napi::Array::New(env, 3);
-        for (u_int32_t j = 0; j < 3; ++j) {
+        for (uint64_t j = 0; j < 3; ++j) {
            tmp[j] = Napi::String::New(env, result[i][j]);
        }
        res[i] = tmp;
--- a/examples/addon.node/package.json
+++ b/examples/addon.node/package.json
@ -5,8 +5,12 @@
  "main": "index.js",
  "author": "Qanhe Chen",
  "license": "MIT",
+  "scripts": {
+    "test": "jest"
+  },
  "devDependencies": {
    "cmake-js": "^7.1.1",
+    "jest": "^29.4.0",
    "node-addon-api": "^5.0.0"
  }
 }
--- a/examples/chess/CMakeLists.txt
+++ b/examples/chess/CMakeLists.txt
@ -0,0 +1,10 @@
+# chess: voice-input chess example; needs SDL2 for audio capture, so nothing
+# is defined here unless SDL2 support is enabled.
+if (NOT WHISPER_SUPPORT_SDL2)
+    return()
+endif ()
+
+set(TARGET chess)
+
+add_executable(${TARGET}
+    chess.cpp
+    )
+
+include(DefaultTargetOptions)
+
+target_include_directories(${TARGET}
+    PRIVATE
+        ${SDL2_INCLUDE_DIRS}
+    )
+
+target_link_libraries(${TARGET}
+    PRIVATE
+        common
+        whisper
+        ${SDL2_LIBRARIES}
+        ${CMAKE_THREAD_LIBS_INIT}
+    )
--- a/examples/chess/chess.cpp
+++ b/examples/chess/chess.cpp
@ -0,0 +1,634 @@
+// Input chess moves via voice
+//
+
+#include "common.h"
+#include "whisper.h"
+
+#include <SDL.h>
+#include <SDL_audio.h>
+
+#include <algorithm>
+#include <atomic>
+#include <cassert>
+#include <chrono>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <fstream>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+// Format a timestamp given in units of 10 ms as "MM:SS.mmm".
+//
+//  500 -> 00:05.000
+//  550 -> 00:05.500
+// 6000 -> 01:00.000
+std::string to_timestamp(int64_t t) {
+    // t counts hundredths of a second; scale to milliseconds first so the
+    // fractional part fills the three-digit field correctly
+    int64_t msec = t * 10;
+
+    const int64_t min = msec / (1000 * 60);
+    msec -= min * (1000 * 60);
+
+    const int64_t sec = msec / 1000;
+    msec -= sec * 1000;
+
+    char buf[32];
+    snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
+
+    return std::string(buf);
+}
+
+// command-line parameters
+//
+// The initialisers are the defaults reported by whisper_print_usage();
+// whisper_params_parse() overwrites the fields selected on the command line.
+struct whisper_params {
+    // NOTE(review): hardware_concurrency() may return 0, which would make the default 0 threads - confirm this is handled downstream
+    int32_t n_threads  = std::min(4, (int32_t) std::thread::hardware_concurrency()); // -t,   --threads
+    int32_t step_ms    = 3000;  //       --step       : audio step size in ms (main() uses VAD mode when <= 0)
+    int32_t length_ms  = 10000; //       --length     : audio length in ms
+    int32_t keep_ms    = 200;   //       --keep       : audio to keep from the previous step, in ms
+    int32_t capture_id = -1;    // -c,   --capture    : capture device ID (negative opens the default device)
+    int32_t max_tokens = 32;    // -mt,  --max-tokens : max tokens per audio chunk (main() currently resets this to 0)
+    int32_t audio_ctx  = 0;     // -ac,  --audio-ctx  : audio context size (0 - all)
+
+    float vad_thold    = 0.6f;   // -vth, --vad-thold  : voice activity detection threshold
+    float freq_thold   = 100.0f; // -fth, --freq-thold : high-pass frequency cutoff
+
+    bool translate     = false; // -tr, --translate
+    bool print_special = false; // -ps, --print-special
+    bool no_context    = true;  // cleared by -kc, --keep-context
+    bool no_timestamps = false; // not a command-line option; main() derives it from the mode
+
+    std::string language  = "en";                      // -l, --language
+    std::string model     = "models/ggml-base.en.bin"; // -m, --model
+    std::string fname_inp;                             // -f, --file
+};
+
+void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
+
+// Parse the command line into `params`.
+//
+// Prints the usage text and exits with status 0 for -h / --help.
+// Returns false, after reporting the problem and printing the usage text,
+// when an argument is unknown or an option that takes a value is the last
+// argument. Returns true otherwise.
+bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
+    // options that consume the following argument as their value
+    static const char * const k_value_options[] = {
+        "-t",   "--threads",
+                "--step",
+                "--length",
+                "--keep",
+        "-c",   "--capture",
+        "-mt",  "--max-tokens",
+        "-ac",  "--audio-ctx",
+        "-vth", "--vad-thold",
+        "-fth", "--freq-thold",
+        "-l",   "--language",
+        "-m",   "--model",
+        "-f",   "--file",
+    };
+
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
+
+        if (arg == "-h" || arg == "--help") {
+            whisper_print_usage(argc, argv, params);
+            exit(0);
+        }
+
+        // refuse to read past the end of argv when the value is missing
+        bool needs_value = false;
+        for (const char * opt : k_value_options) {
+            if (arg == opt) {
+                needs_value = true;
+                break;
+            }
+        }
+
+        if (needs_value && i + 1 >= argc) {
+            fprintf(stderr, "error: missing value for argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            return false;
+        }
+
+        if      (arg == "-t"   || arg == "--threads")       { params.n_threads     = std::stoi(argv[++i]); }
+        else if (                 arg == "--step")          { params.step_ms       = std::stoi(argv[++i]); }
+        else if (                 arg == "--length")        { params.length_ms     = std::stoi(argv[++i]); }
+        else if (                 arg == "--keep")          { params.keep_ms       = std::stoi(argv[++i]); }
+        else if (arg == "-c"   || arg == "--capture")       { params.capture_id    = std::stoi(argv[++i]); }
+        else if (arg == "-mt"  || arg == "--max-tokens")    { params.max_tokens    = std::stoi(argv[++i]); }
+        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
+        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
+        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
+        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
+        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
+        else if (arg == "-kc"  || arg == "--keep-context")  { params.no_context    = false; }
+        else if (arg == "-l"   || arg == "--language")      { params.language      = argv[++i]; }
+        else if (arg == "-m"   || arg == "--model")         { params.model         = argv[++i]; }
+        else if (arg == "-f"   || arg == "--file")          { params.fname_inp     = argv[++i]; }
+        else {
+            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
+            whisper_print_usage(argc, argv, params);
+            // report failure to the caller instead of exiting with status 0
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Print the option summary to stderr.
+//
+// argv[0] is used as the program name; the bracketed column shows the current
+// value of each field of `params`, so the output reflects the defaults only
+// when called before any option has been applied.
+void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
+    fprintf(stderr, "\n");
+    fprintf(stderr, "usage: %s [options]\n", argv[0]);
+    fprintf(stderr, "\n");
+    fprintf(stderr, "options:\n");
+    fprintf(stderr, "  -h,       --help          [default] show this help message and exit\n");
+    fprintf(stderr, "  -t N,     --threads N     [%-7d] number of threads to use during computation\n", params.n_threads);
+    fprintf(stderr, "            --step N        [%-7d] audio step size in milliseconds\n",             params.step_ms);
+    fprintf(stderr, "            --length N      [%-7d] audio length in milliseconds\n",                params.length_ms);
+    fprintf(stderr, "            --keep N        [%-7d] audio to keep from previous step in ms\n",      params.keep_ms);
+    fprintf(stderr, "  -c ID,    --capture ID    [%-7d] capture device ID\n",                           params.capture_id);
+    fprintf(stderr, "  -mt N,    --max-tokens N  [%-7d] maximum number of tokens per audio chunk\n",    params.max_tokens);
+    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
+    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
+    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
+    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
+    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
+    // the stored flag is the negation of the option, hence the swapped strings
+    fprintf(stderr, "  -kc,      --keep-context  [%-7s] keep context between audio chunks\n",           params.no_context ? "false" : "true");
+    fprintf(stderr, "  -l LANG,  --language LANG [%-7s] spoken language\n",                             params.language.c_str());
+    fprintf(stderr, "  -m FNAME, --model FNAME   [%-7s] model path\n",                                  params.model.c_str());
+    fprintf(stderr, "  -f FNAME, --file FNAME    [%-7s] input WAV file path\n",                         params.fname_inp.c_str());
+    fprintf(stderr, "\n");
+}
+
+//
+// SDL Audio capture
+//
+
+// Captures audio from an SDL capture device into a circular buffer of floats.
+// The SDL callback writes into the buffer and get() reads from it; both take
+// m_mutex while touching the buffer.
+class audio_async {
+public:
+    // len_ms: length of the circular buffer in milliseconds
+    audio_async(int len_ms);
+    ~audio_async();
+
+    // open the capture device and size the circular buffer; false on failure
+    bool init(int capture_id, int sample_rate);
+
+    // start capturing audio via the provided SDL callback
+    // keep last len_ms milliseconds of audio in a circular buffer
+    bool resume();
+    bool pause();
+    // discard the buffered audio (only valid while running)
+    bool clear();
+
+    // callback to be called by SDL
+    void callback(uint8_t * stream, int len);
+
+    // get audio data from the circular buffer
+    // ms <= 0 requests the whole buffer length
+    void get(int ms, std::vector<float> & audio);
+
+private:
+    SDL_AudioDeviceID m_dev_id_in = 0; // 0 means no device is open
+
+    int m_len_ms = 0;      // buffer length in milliseconds
+    int m_sample_rate = 0; // rate obtained from SDL in init()
+
+    std::atomic_bool m_running; // true between resume() and pause()
+    std::mutex       m_mutex;   // guards m_audio, m_audio_pos and m_audio_len
+
+    std::vector<float> m_audio;         // circular buffer storage
+    std::vector<float> m_audio_new;     // copy of the most recent callback chunk
+    size_t             m_audio_pos = 0; // next write index into m_audio
+    size_t             m_audio_len = 0; // number of valid samples in m_audio
+};
+
+// Record the requested buffer length (in ms); capture starts out stopped.
+audio_async::audio_async(int len_ms)
+    : m_len_ms(len_ms)
+    , m_running(false) {
+}
+
+// Close the capture device, if one was opened.
+audio_async::~audio_async() {
+    if (m_dev_id_in == 0) {
+        return;
+    }
+
+    SDL_CloseAudioDevice(m_dev_id_in);
+}
+
+// Initialise the SDL audio subsystem and open a capture device.
+//
+// capture_id  - index of the capture device to open; a negative value selects
+//               the default capture device
+// sample_rate - requested sampling rate in Hz
+//
+// Returns false when SDL cannot be initialised or no device can be opened.
+// On success the circular buffer is sized to hold m_len_ms milliseconds of
+// audio at the sampling rate reported by SDL.
+bool audio_async::init(int capture_id, int sample_rate) {
+    SDL_LogSetPriority(SDL_LOG_CATEGORY_APPLICATION, SDL_LOG_PRIORITY_INFO);
+
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        SDL_LogError(SDL_LOG_CATEGORY_APPLICATION, "Couldn't initialize SDL: %s\n", SDL_GetError());
+        return false;
+    }
+
+    SDL_SetHintWithPriority(SDL_HINT_AUDIO_RESAMPLING_MODE, "medium", SDL_HINT_OVERRIDE);
+
+    // list the available capture devices
+    {
+        int nDevices = SDL_GetNumAudioDevices(SDL_TRUE);
+        fprintf(stderr, "%s: found %d capture devices:\n", __func__, nDevices);
+        for (int i = 0; i < nDevices; i++) {
+            const char * name = SDL_GetAudioDeviceName(i, SDL_TRUE);
+            fprintf(stderr, "%s:    - Capture device #%d: '%s'\n", __func__, i, name ? name : "(unknown)");
+        }
+    }
+
+    SDL_AudioSpec capture_spec_requested;
+    SDL_AudioSpec capture_spec_obtained;
+
+    SDL_zero(capture_spec_requested);
+    SDL_zero(capture_spec_obtained);
+
+    capture_spec_requested.freq     = sample_rate;
+    capture_spec_requested.format   = AUDIO_F32;
+    capture_spec_requested.channels = 1;
+    capture_spec_requested.samples  = 1024;
+    // SDL hands the userdata pointer back, which lets the captureless lambda
+    // forward to the member callback
+    capture_spec_requested.callback = [](void * userdata, uint8_t * stream, int len) {
+        audio_async * audio = (audio_async *) userdata;
+        audio->callback(stream, len);
+    };
+    capture_spec_requested.userdata = this;
+
+    // The device name stays NULL for the default device. It is also NULL when
+    // SDL has no name for capture_id, in which case the default device is
+    // opened as before - but NULL must never reach a "%s" conversion.
+    const char * dev_name = nullptr;
+
+    if (capture_id >= 0) {
+        dev_name = SDL_GetAudioDeviceName(capture_id, SDL_TRUE);
+        fprintf(stderr, "%s: attempt to open capture device %d : '%s' ...\n", __func__, capture_id, dev_name ? dev_name : "(unknown)");
+    } else {
+        fprintf(stderr, "%s: attempt to open default capture device ...\n", __func__);
+    }
+
+    m_dev_id_in = SDL_OpenAudioDevice(dev_name, SDL_TRUE, &capture_spec_requested, &capture_spec_obtained, 0);
+
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: couldn't open an audio device for capture: %s!\n", __func__, SDL_GetError());
+
+        return false;
+    }
+
+    fprintf(stderr, "%s: obtained spec for input device (SDL Id = %d):\n", __func__, m_dev_id_in);
+    fprintf(stderr, "%s:     - sample rate:       %d\n",                   __func__, capture_spec_obtained.freq);
+    fprintf(stderr, "%s:     - format:            %d (required: %d)\n",    __func__, capture_spec_obtained.format,
+            capture_spec_requested.format);
+    fprintf(stderr, "%s:     - channels:          %d (required: %d)\n",    __func__, capture_spec_obtained.channels,
+            capture_spec_requested.channels);
+    fprintf(stderr, "%s:     - samples per frame: %d\n",                   __func__, capture_spec_obtained.samples);
+
+    m_sample_rate = capture_spec_obtained.freq;
+
+    // 64-bit product: rate * milliseconds overflows int for long buffers
+    const int64_t n_buffer = ((int64_t) m_sample_rate*m_len_ms)/1000;
+
+    m_audio.resize(n_buffer > 0 ? (size_t) n_buffer : 0);
+
+    return true;
+}
+
+// Unpause the capture device and mark capture as running.
+// Returns false when no device is open or capture is already running.
+bool audio_async::resume() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to resume!\n", __func__);
+        return false;
+    }
+
+    if (m_running.load()) {
+        fprintf(stderr, "%s: already running!\n", __func__);
+        return false;
+    }
+
+    const int pause_on = 0; // 0 lets SDL deliver audio to the callback
+    SDL_PauseAudioDevice(m_dev_id_in, pause_on);
+
+    m_running.store(true);
+
+    return true;
+}
+
+// Pause the capture device and mark capture as stopped.
+// Returns false when no device is open or capture is already paused.
+bool audio_async::pause() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to pause!\n", __func__);
+        return false;
+    }
+
+    if (m_running.load() == false) {
+        fprintf(stderr, "%s: already paused!\n", __func__);
+        return false;
+    }
+
+    const int pause_on = 1; // 1 stops SDL from delivering audio
+    SDL_PauseAudioDevice(m_dev_id_in, pause_on);
+
+    m_running.store(false);
+
+    return true;
+}
+
+// Forget all buffered audio by resetting the write position and valid length.
+// Returns false when no device is open or capture is not running.
+bool audio_async::clear() {
+    if (m_dev_id_in == 0) {
+        fprintf(stderr, "%s: no audio device to clear!\n", __func__);
+        return false;
+    }
+
+    if (m_running.load() == false) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return false;
+    }
+
+    // the SDL callback updates the same two fields, so reset them under the lock
+    std::lock_guard<std::mutex> guard(m_mutex);
+
+    m_audio_len = 0;
+    m_audio_pos = 0;
+
+    return true;
+}
+
+// callback to be called by SDL
+//
+// Appends the float samples contained in the `len` bytes at `stream` to the
+// circular buffer, overwriting the oldest samples once the buffer is full.
+// Does nothing while capture is not running.
+void audio_async::callback(uint8_t * stream, int len) {
+    if (!m_running) {
+        return;
+    }
+
+    if (len <= 0) {
+        return;
+    }
+
+    size_t n_samples = len / sizeof(float);
+
+    // Stage the raw bytes as floats. All offsets below are then counted in
+    // samples; indexing `stream` directly would advance by bytes instead.
+    m_audio_new.resize(n_samples);
+    memcpy(m_audio_new.data(), stream, n_samples * sizeof(float));
+
+    //fprintf(stderr, "%s: %zu samples, pos %zu, len %zu\n", __func__, n_samples, m_audio_pos, m_audio_len);
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        const size_t capacity = m_audio.size();
+        if (capacity == 0 || n_samples == 0) {
+            // nothing to store, and the modulo below would divide by zero
+            return;
+        }
+
+        const float * src = m_audio_new.data();
+
+        // a chunk larger than the whole buffer: only its newest samples fit
+        if (n_samples > capacity) {
+            src       += n_samples - capacity;
+            n_samples  = capacity;
+        }
+
+        // n0 samples fit before the end of the buffer, the rest wraps to the start
+        const size_t n0 = std::min(n_samples, capacity - m_audio_pos);
+
+        memcpy(&m_audio[m_audio_pos], src, n0 * sizeof(float));
+        memcpy(&m_audio[0], src + n0, (n_samples - n0) * sizeof(float));
+
+        m_audio_pos = (m_audio_pos + n_samples) % capacity;
+        m_audio_len = std::min(m_audio_len + n_samples, capacity);
+    }
+}
+
+// Copy the most recent `ms` milliseconds of captured audio into `result`,
+// oldest sample first.
+//
+// ms <= 0 requests the full buffer length (m_len_ms). Fewer samples are
+// returned when less audio has been captured so far. `result` is left
+// untouched when no device is open or capture is not running.
+void audio_async::get(int ms, std::vector<float> & result) {
+    if (!m_dev_id_in) {
+        fprintf(stderr, "%s: no audio device to get audio from!\n", __func__);
+        return;
+    }
+
+    if (!m_running) {
+        fprintf(stderr, "%s: not running!\n", __func__);
+        return;
+    }
+
+    result.clear();
+
+    {
+        std::lock_guard<std::mutex> lock(m_mutex);
+
+        if (ms <= 0) {
+            ms = m_len_ms;
+        }
+
+        // 64-bit product: rate * milliseconds overflows int for long windows
+        const int64_t n_requested = ((int64_t) m_sample_rate * ms) / 1000;
+
+        const size_t n_samples = n_requested > 0 ? std::min((size_t) n_requested, m_audio_len) : 0;
+        if (n_samples == 0) {
+            // result is already empty
+            return;
+        }
+
+        result.resize(n_samples);
+
+        // n_samples <= m_audio_len <= capacity, so capacity is non-zero here
+        const size_t capacity = m_audio.size();
+
+        // index of the oldest requested sample, computed without going negative
+        const size_t s0 = (m_audio_pos + capacity - n_samples) % capacity;
+
+        // n0 samples are read up to the end of the buffer, the rest from its start
+        const size_t n0 = std::min(n_samples, capacity - s0);
+
+        memcpy(result.data(), &m_audio[s0], n0 * sizeof(float));
+        memcpy(result.data() + n0, &m_audio[0], (n_samples - n0) * sizeof(float));
+    }
+}
+
+///////////////////////////
+
+// Capture audio from an SDL device and transcribe it with whisper until SDL
+// reports a quit event.
+//
+// Two modes are selected by the step size:
+//   step_ms >  0 : a fixed amount of new audio is transcribed on every step
+//   step_ms <= 0 : VAD mode, transcription runs when speech activity is detected
+//
+// Returns 0 on a clean shutdown and a non-zero status on any failure.
+int main(int argc, char ** argv) {
+    whisper_params params;
+
+    if (whisper_params_parse(argc, argv, params) == false) {
+        return 1;
+    }
+
+    params.keep_ms   = std::min(params.keep_ms,   params.step_ms);
+    params.length_ms = std::max(params.length_ms, params.step_ms);
+
+    const int n_samples_step = (1e-3*params.step_ms  )*WHISPER_SAMPLE_RATE;
+    const int n_samples_len  = (1e-3*params.length_ms)*WHISPER_SAMPLE_RATE;
+    const int n_samples_keep = (1e-3*params.keep_ms  )*WHISPER_SAMPLE_RATE;
+    const int n_samples_30s  = (1e-3*30000.0         )*WHISPER_SAMPLE_RATE;
+
+    const bool use_vad = n_samples_step <= 0; // sliding window mode uses VAD
+
+    const int n_new_line = !use_vad ? std::max(1, params.length_ms / params.step_ms - 1) : 1; // number of steps to print new line
+
+    params.no_timestamps  = !use_vad;
+    params.no_context    |= use_vad;
+    // NOTE(review): this discards any value given with -mt / --max-tokens - confirm that is intended
+    params.max_tokens     = 0;
+
+    // init audio
+
+    audio_async audio(params.length_ms);
+    if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
+        fprintf(stderr, "%s: audio.init() failed!\n", __func__);
+        return 1;
+    }
+
+    audio.resume();
+
+    // whisper init
+
+    if (whisper_lang_id(params.language.c_str()) == -1) {
+        fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
+        whisper_print_usage(argc, argv, params);
+        // an invalid option is an error: report it through the exit status
+        return 1;
+    }
+
+    struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
+
+    // every call below dereferences the context, so stop here if loading failed
+    if (ctx == nullptr) {
+        fprintf(stderr, "%s: failed to initialize whisper context from '%s'\n", __func__, params.model.c_str());
+        return 2;
+    }
+
+    std::vector<float> pcmf32    (n_samples_30s, 0.0f);
+    std::vector<float> pcmf32_old;
+    std::vector<float> pcmf32_new(n_samples_30s, 0.0f);
+
+    std::vector<whisper_token> prompt_tokens;
+
+    // print some info about the processing
+    {
+        fprintf(stderr, "\n");
+        if (!whisper_is_multilingual(ctx)) {
+            if (params.language != "en" || params.translate) {
+                params.language = "en";
+                params.translate = false;
+                fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
+            }
+        }
+        fprintf(stderr, "%s: processing %d samples (step = %.1f sec / len = %.1f sec / keep = %.1f sec), %d threads, lang = %s, task = %s, timestamps = %d ...\n",
+                __func__,
+                n_samples_step,
+                float(n_samples_step)/WHISPER_SAMPLE_RATE,
+                float(n_samples_len )/WHISPER_SAMPLE_RATE,
+                float(n_samples_keep)/WHISPER_SAMPLE_RATE,
+                params.n_threads,
+                params.language.c_str(),
+                params.translate ? "translate" : "transcribe",
+                params.no_timestamps ? 0 : 1);
+
+        if (!use_vad) {
+            fprintf(stderr, "%s: n_new_line = %d, no_context = %d\n", __func__, n_new_line, params.no_context);
+        } else {
+            fprintf(stderr, "%s: using VAD, will transcribe on speech activity\n", __func__);
+        }
+
+        fprintf(stderr, "\n");
+    }
+
+    int n_iter = 0;
+
+    bool is_running = true;
+
+    printf("[Start speaking]");
+    fflush(stdout);
+
+    auto       t_last  = std::chrono::high_resolution_clock::now();
+    const auto t_start = t_last;
+
+    // main audio loop
+    while (is_running) {
+        // handle Ctrl + C
+        {
+            SDL_Event event;
+            while (SDL_PollEvent(&event)) {
+                switch (event.type) {
+                    case SDL_QUIT:
+                        {
+                            is_running = false;
+                        } break;
+                    default:
+                        break;
+                }
+            }
+
+            if (!is_running) {
+                break;
+            }
+        }
+
+        // process new audio
+
+        if (!use_vad) {
+            // wait until a full step of new audio is available
+            while (true) {
+                audio.get(params.step_ms, pcmf32_new);
+
+                if ((int) pcmf32_new.size() > 2*n_samples_step) {
+                    fprintf(stderr, "\n\n%s: WARNING: cannot process audio fast enough, dropping audio ...\n\n", __func__);
+                    audio.clear();
+                    continue;
+                }
+
+                if ((int) pcmf32_new.size() >= n_samples_step) {
+                    audio.clear();
+                    break;
+                }
+
+                SDL_Delay(1);
+            }
+
+            const int n_samples_new = pcmf32_new.size();
+
+            // take up to params.length_ms audio from previous iteration
+            const int n_samples_take = std::min((int) pcmf32_old.size(), std::max(0, n_samples_keep + n_samples_len - n_samples_new));
+
+            //printf("processing: take = %d, new = %d, old = %d\n", n_samples_take, n_samples_new, (int) pcmf32_old.size());
+
+            pcmf32.resize(n_samples_new + n_samples_take);
+
+            for (int i = 0; i < n_samples_take; i++) {
+                pcmf32[i] = pcmf32_old[pcmf32_old.size() - n_samples_take + i];
+            }
+
+            memcpy(pcmf32.data() + n_samples_take, pcmf32_new.data(), n_samples_new*sizeof(float));
+
+            pcmf32_old = pcmf32;
+        } else {
+            const auto t_now  = std::chrono::high_resolution_clock::now();
+            const auto t_diff = std::chrono::duration_cast<std::chrono::milliseconds>(t_now - t_last).count();
+
+            // check for speech at most once every two seconds
+            if (t_diff < 2000) {
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+                continue;
+            }
+
+            audio.get(2000, pcmf32_new);
+
+            if (vad_simple(pcmf32_new, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, false)) {
+                audio.get(params.length_ms, pcmf32);
+            } else {
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+
+                continue;
+            }
+
+            t_last = t_now;
+        }
+
+        // run the inference
+        {
+            whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+
+            wparams.print_progress   = false;
+            wparams.print_special    = params.print_special;
+            wparams.print_realtime   = false;
+            wparams.print_timestamps = !params.no_timestamps;
+            wparams.translate        = params.translate;
+            wparams.no_context       = true;
+            wparams.single_segment   = !use_vad;
+            wparams.max_tokens       = params.max_tokens;
+            wparams.language         = params.language.c_str();
+            wparams.n_threads        = params.n_threads;
+
+            wparams.audio_ctx        = params.audio_ctx;
+
+            // disable temperature fallback
+            wparams.temperature_inc  = -1.0f;
+
+            wparams.prompt_tokens    = params.no_context ? nullptr : prompt_tokens.data();
+            wparams.prompt_n_tokens  = params.no_context ? 0       : prompt_tokens.size();
+
+            if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
+                fprintf(stderr, "%s: failed to process audio\n", argv[0]);
+                // release the context on the error path as well
+                whisper_free(ctx);
+                return 6;
+            }
+
+            // print result;
+            {
+                if (!use_vad) {
+                    printf("\33[2K\r");
+
+                    // print long empty line to clear the previous line
+                    printf("%s", std::string(100, ' ').c_str());
+
+                    printf("\33[2K\r");
+                } else {
+                    // convert explicitly instead of assuming the clock ticks in nanoseconds
+                    const int64_t t1 = std::chrono::duration_cast<std::chrono::milliseconds>(t_last - t_start).count();
+                    const int64_t t0 = std::max(0.0, t1 - pcmf32.size()*1000.0/WHISPER_SAMPLE_RATE);
+
+                    printf("\n");
+                    printf("### Transcription %d START | t0 = %d ms | t1 = %d ms\n", n_iter, (int) t0, (int) t1);
+                    printf("\n");
+                }
+
+                const int n_segments = whisper_full_n_segments(ctx);
+                for (int i = 0; i < n_segments; ++i) {
+                    const char * text = whisper_full_get_segment_text(ctx, i);
+
+                    if (params.no_timestamps) {
+                        printf("%s", text);
+                        fflush(stdout);
+                    } else {
+                        const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                        const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+                        printf ("[%s --> %s]  %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
+                    }
+                }
+
+                if (use_vad){
+                    printf("\n");
+                    printf("### Transcription %d END\n", n_iter);
+                }
+            }
+
+            ++n_iter;
+
+            if (!use_vad && (n_iter % n_new_line) == 0) {
+                printf("\n");
+
+                // keep part of the audio for next iteration to try to mitigate word boundary issues
+                // (clamped so a negative or oversized keep length cannot move the iterator out of range)
+                const size_t n_keep = std::min((size_t) std::max(0, n_samples_keep), pcmf32.size());
+
+                pcmf32_old = std::vector<float>(pcmf32.end() - n_keep, pcmf32.end());
+
+                // Add tokens of the last full length segment as the prompt
+                if (!params.no_context) {
+                    prompt_tokens.clear();
+
+                    const int n_segments = whisper_full_n_segments(ctx);
+                    for (int i = 0; i < n_segments; ++i) {
+                        const int token_count = whisper_full_n_tokens(ctx, i);
+                        for (int j = 0; j < token_count; ++j) {
+                            prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    audio.pause();
+
+    whisper_print_timings(ctx);
+    whisper_free(ctx);
+
+    return 0;
+}
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -69,6 +69,7 @@ struct whisper_params {
    bool speed_up       = false;
    bool translate      = false;
    bool diarize        = false;
+    bool split_on_word  = false;
    bool no_fallback    = false;
    bool output_txt     = false;
    bool output_vtt     = false;
@ -118,6 +119,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-su"   || arg == "--speed-up")       { params.speed_up       = true; }
        else if (arg == "-tr"   || arg == "--translate")      { params.translate      = true; }
        else if (arg == "-di"   || arg == "--diarize")        { params.diarize        = true; }
+        else if (arg == "-sow"  || arg == "--split-on-word")  { params.split_on_word  = true; }
        else if (arg == "-nf"   || arg == "--no-fallback")    { params.no_fallback    = true; }
        else if (arg == "-otxt" || arg == "--output-txt")     { params.output_txt     = true; }
        else if (arg == "-ovtt" || arg == "--output-vtt")     { params.output_vtt     = true; }
@ -156,6 +158,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -d  N,     --duration N        [%-7d] duration of audio to process in milliseconds\n",   params.duration_ms);
    fprintf(stderr, "  -mc N,     --max-context N     [%-7d] maximum number of text context tokens to store\n", params.max_context);
    fprintf(stderr, "  -ml N,     --max-len N         [%-7d] maximum segment length in characters\n",           params.max_len);
+    fprintf(stderr, "  -sow,      --split-on-word     [%-7s] split on word rather than on token\n",             params.split_on_word ? "true" : "false");
    fprintf(stderr, "  -bo N,     --best-of N         [%-7d] number of best candidates to keep\n",              params.best_of);
    fprintf(stderr, "  -bs N,     --beam-size N       [%-7d] beam size for beam search\n",                      params.beam_size);
    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
@ -651,6 +654,7 @@ int main(int argc, char ** argv) {
            wparams.token_timestamps = params.output_wts || params.max_len > 0;
            wparams.thold_pt         = params.word_thold;
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
+            wparams.split_on_word    = params.split_on_word;

            wparams.speed_up         = params.speed_up;

--- a/examples/yt-wsp.sh
+++ b/examples/yt-wsp.sh
@ -1,20 +1,10 @@
 #!/usr/bin/env bash
-
-# Small shell script to more easily automatically download and transcribe live stream VODs.
-# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
-# Use `./examples/yt-wsp.sh help` to print help info.
-#
-# Sample usage:
-#
-#   git clone https://github.com/ggerganov/whisper.cpp
-#   cd whisper.cpp
-#   make
-#   ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
-#
+# shellcheck disable=2086

 # MIT License

 # Copyright (c) 2022 Daniils Petrovs
+# Copyright (c) 2023 Jennifer Capasso

 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@ -34,114 +24,181 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.

+# Small shell script to more easily automatically download and transcribe live stream VODs.
+# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
+# Use `./examples/yt-wsp.sh help` to print help info.
+#
+# Sample usage:
+#
+#   git clone https://github.com/ggerganov/whisper.cpp
+#   cd whisper.cpp
+#   make
+#   ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
+#
+
 set -Eeuo pipefail

-# You can find how to download models in the OG repo: https://github.com/ggerganov/whisper.cpp/#usage
-MODEL_PATH="${MODEL_PATH:-models/ggml-base.en.bin}" # Set to a multilingual model if you want to translate from foreign lang to en
-WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-whisper}" # Where to find the whisper.cpp executable
-WHISPER_LANG="${WHISPER_LANG:-en}" # Set to desired lang to translate from
+# Resolve the absolute path of this script and the directory that holds it.
+# BASH_SOURCE is quoted so a checkout path containing spaces is not word-split.
+SCRIPT_PATH="$(realpath -e "${BASH_SOURCE[0]}")";
+SCRIPT_DIR="${SCRIPT_PATH%/*}"
+
+################################################################################
+# Documentation on downloading models can be found in the whisper.cpp repo:
+# https://github.com/ggerganov/whisper.cpp/#usage
+#
+# note: unless a multilingual model is specified, WHISPER_LANG will be ignored
+# and the video will be transcribed as if the audio were in the English language
+################################################################################
+MODEL_PATH="${MODEL_PATH:-${SCRIPT_DIR}/../models/ggml-base.en.bin}"
+
+################################################################################
+# Where to find the whisper.cpp executable.  default to the examples directory
+# which holds this script in source control
+################################################################################
+WHISPER_EXECUTABLE="${WHISPER_EXECUTABLE:-${SCRIPT_DIR}/../main}";
+
+# Set to desired language to be translated into english
+WHISPER_LANG="${WHISPER_LANG:-en}";
+
+# Default to 4 threads (this was most performant on my 2020 M1 MBP)
+WHISPER_THREAD_COUNT="${WHISPER_THREAD_COUNT:-4}";

 # Write the first argument (empty when omitted) to stderr, letting echo
 # interpret backslash escapes.
 msg() {
    echo -e "${1-}" >&2
 }

+################################################################################
+# create a temporary directory to work in
+# set the temp_dir and temp_filename variables
+# (the mktemp template is quoted so that a SCRIPT_DIR containing spaces stays a
+# single argument)
+################################################################################
+temp_dir="$(mktemp -d "${SCRIPT_DIR}/tmp.XXXXXX")";
+temp_filename="${temp_dir}/yt-dlp-filename";
+
+################################################################################
+# for now we only take one argument
+# TODO: a for loop
+#
+# default to an empty string when no argument was given: under `set -u` a bare
+# "${1}" aborts the script right here, before the usage text can be printed
+################################################################################
+source_url="${1:-}"
+
+
+title_name="";
+
+
+# Recursively delete the directory passed as the first argument.
+# If the argument is not a directory, report that via msg and exit with status 1.
 cleanup() {
-    msg "Cleaning up..."
-    rm -rf "${temp_dir}" "vod-resampled.wav" "vod-resampled.wav.srt"
+    local -r clean_me="${1}";
+
+    if [ -d "${clean_me}" ]; then
+      msg "Cleaning up...";
+      rm -rf "${clean_me}";
+    else
+      msg "'${clean_me}' does not appear to be a directory!";
+      exit 1;
+    fi;
 }

+# Print the usage banner (invocation, output file naming, requirements) to stdout.
 print_help() {
+    echo "################################################################################"
    echo "Usage: ./examples/yt-wsp.sh <video_url>"
-    echo "See configurable env variables in the script"
-    echo "This will produce an MP4 muxed file called res.mp4 in the working directory"
-    echo "Requirements: ffmpeg yt-dlp whisper"
-    echo "Whisper needs to be built into the main binary with make, then you can rename it to something like 'whisper' and add it to your PATH for convenience."
-    echo "E.g. in the root of Whisper.cpp, run: 'make && cp ./main /usr/local/bin/whisper'"
+    echo "# See configurable env variables in the script; there are many!"
+    echo "# This script will produce an MP4 muxed file in the working directory; it will"
+    echo "# be named for the title and id of the video."
+    echo "# passing in https://youtu.be/VYJtb2YXae8 produces a file named";
+    echo "# 'Why_we_all_need_subtitles_now-VYJtb2YXae8-res.mp4'"
+    echo "# Requirements: ffmpeg yt-dlp whisper.cpp"
+    echo "################################################################################"
 }

+# Verify that ffmpeg, yt-dlp and ${WHISPER_EXECUTABLE} each resolve via `command -v`.
+# For the first one that does not, print a hint to stdout and exit with status 1.
 check_requirements() {
    if ! command -v ffmpeg &>/dev/null; then
-        echo "ffmpeg is required (https://ffmpeg.org)."
+        echo "ffmpeg is required: https://ffmpeg.org";
        exit 1
-    fi
+    fi;

    if ! command -v yt-dlp &>/dev/null; then
-        echo "yt-dlp is required (https://github.com/yt-dlp/yt-dlp)."
-        exit 1
-    fi
+        echo "yt-dlp is required: https://github.com/yt-dlp/yt-dlp";
+        exit 1;
+    fi;
+
+    if ! command -v "${WHISPER_EXECUTABLE}" &>/dev/null; then
+        echo "The C++ implementation of Whisper is required: https://github.com/ggerganov/whisper.cpp"
+        echo "Sample usage:";
+        echo "";
+        echo "  git clone https://github.com/ggerganov/whisper.cpp";
+        echo "  cd whisper.cpp";
+        echo "  make";
+        echo "  ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890";
+        echo "";
+        exit 1;
+    fi;

-    if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
-        WHISPER_EXECUTABLE="./main"
-        if ! command -v "$WHISPER_EXECUTABLE" &>/dev/null; then
-            echo "Whisper is required (https://github.com/ggerganov/whisper.cpp):"
-            echo "Sample usage:"
-            echo ""
-            echo "  git clone https://github.com/ggerganov/whisper.cpp"
-            echo "  cd whisper.cpp"
-            echo "  make"
-            echo "  ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890"
-            echo ""
-            exit 1
-        fi
-    fi
 }

-if [[ $# -lt 1 ]]; then
-    print_help
-    exit 1
+if [[ "${#}" -lt 1 ]]; then
+    print_help;
+    exit 1;
 fi

-if [[ "$1" == "help" ]]; then
-    print_help
-    exit 0
+# Show the usage text for "help" and for the dashed spellings -h / -help / --help.
+# Note: "${1##-*}" cannot strip leading dashes; the pattern "-*" matches the
+# whole argument, so every dashed form would expand to the empty string.
+if [[ "${1}" == "help" || "${1}" == "-h" || "${1}" == "-help" || "${1}" == "--help" ]]; then
+    print_help;
+    exit 0;
 fi

-temp_dir="tmp"
-source_url="$1"
+check_requirements;

-check_requirements
+msg "Downloading VOD...";

-msg "Downloading VOD..."
-
-# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER] for members only VODs
+################################################################################
+# Download the video, put the dynamic output filename into a variable.
+# Optionally add --cookies-from-browser BROWSER[+KEYRING][:PROFILE][::CONTAINER]
+# for videos only available to logged-in users.
+################################################################################
 yt-dlp \
    -f "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best" \
+    -o "${temp_dir}/%(title)s-%(id)s.vod.mp4" \
+    --print-to-file "%(filename)s" "${temp_filename}" \
+    --no-simulate \
+    --no-write-auto-subs \
+    --restrict-filenames \
    --embed-thumbnail \
    --embed-chapters \
    --xattrs \
-    "${source_url}" -o "${temp_dir}/vod.mp4"
+    "${source_url}";

-msg "Extracting audio and resampling..."
+# Read the path yt-dlp recorded and strip its directory and the ".vod.mp4" suffix.
+# The path is passed to basename as one quoted argument; piping it through
+# xargs would re-split it on whitespace and interpret quotes/backslashes.
+title_name="$(basename -s .vod.mp4 "$(< "${temp_filename}")")";

-ffmpeg -i "${temp_dir}/vod.mp4" \
+msg "Extracting audio and resampling...";
+
+ffmpeg -i "${temp_dir}/${title_name}.vod.mp4"  \
    -hide_banner \
+    -vn \
    -loglevel error \
    -ar 16000 \
    -ac 1 \
-    -c:a \
-    pcm_s16le -y "vod-resampled.wav"
+    -c:a pcm_s16le \
+    -y \
+    "${temp_dir}/${title_name}.vod-resampled.wav";

-msg "Transcribing to subtitle file..."
-msg "Whisper specified at: ${WHISPER_EXECUTABLE}"
+msg "Transcribing to subtitle file...";
+msg "Whisper specified at: '${WHISPER_EXECUTABLE}'";

-$WHISPER_EXECUTABLE \
+"${WHISPER_EXECUTABLE}" \
    -m "${MODEL_PATH}" \
    -l "${WHISPER_LANG}" \
-    -f "vod-resampled.wav" \
-    -t 8 \
+    -f "${temp_dir}/${title_name}.vod-resampled.wav" \
+    -t "${WHISPER_THREAD_COUNT}" \
    -osrt \
-    --translate
+    --translate;

-msg "Embedding subtitle track..."
+msg "Embedding subtitle track...";

-ffmpeg -i "${temp_dir}/vod.mp4" \
+ffmpeg -i "${temp_dir}/${title_name}.vod.mp4" \
    -hide_banner \
    -loglevel error \
-    -i "vod-resampled.wav.srt" \
+    -i "${temp_dir}/${title_name}.vod-resampled.wav.srt" \
    -c copy \
    -c:s mov_text \
-    -y res.mp4
+    -y "${title_name}-res.mp4";

-cleanup
+cleanup "${temp_dir}";

-msg "Done! Your finished file is ready: res.mp4"
+msg "Done! Your finished file is ready: ${title_name}-res.mp4";
--- a/whisper.cpp
+++ b/whisper.cpp
@ -592,6 +592,8 @@ struct whisper_context {

    mutable std::mt19937 rng; // used for sampling at t > 0.0

+    // language id recorded by whisper_full(); it is only assigned on the
+    // auto-detect and multilingual paths, so start from 0 rather than an
+    // indeterminate value that whisper_full_lang_id() could otherwise return
+    int lang_id = 0;
+
    // [EXPERIMENTAL] token-level timestamps data
    int64_t t_beg;
    int64_t t_last;
@ -2903,7 +2905,7 @@ const char * whisper_print_system_info(void) {

 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
    struct whisper_full_params result = {
-        /*.strategy         =*/ WHISPER_SAMPLING_GREEDY,
+        /*.strategy         =*/ strategy,

        /*.n_threads        =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
        /*.n_max_text_ctx   =*/ 16384,
@ -2922,6 +2924,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.thold_pt         =*/ 0.01f,
        /*.thold_ptsum      =*/ 0.01f,
        /*.max_len          =*/ 0,
+        /*.split_on_word    =*/ false,
        /*.max_tokens       =*/ 0,

        /*.speed_up         =*/ false,
@ -2933,6 +2936,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.language         =*/ "en",

        /*.suppress_blank   =*/ true,
+        /*.suppress_non_speech_tokens =*/true,

        /*.temperature      =*/  0.0f,
        /*.max_initial_ts   =*/  1.0f,
@ -2988,9 +2992,35 @@ static void whisper_exp_compute_token_level_timestamps(
                         float   thold_pt,
                         float   thold_ptsum);

+// remove leading whitespace from `s` (in place)
+static inline void ltrim(std::string &s) {
+    const auto is_not_space = [](unsigned char c) { return !std::isspace(c); };
+    const auto first_kept   = std::find_if(s.begin(), s.end(), is_not_space);
+    s.erase(s.begin(), first_kept);
+}
+
+// remove trailing whitespace from `s` (in place)
+static inline void rtrim(std::string &s) {
+    const auto is_not_space = [](unsigned char c) { return !std::isspace(c); };
+    // reverse search: base() of the hit points one past the last kept character
+    const auto last_kept = std::find_if(s.rbegin(), s.rend(), is_not_space);
+    s.erase(last_kept.base(), s.end());
+}
+
+// strip whitespace from both ends of `s` (in place)
+static inline void trim(std::string &s) {
+    ltrim(s);
+    rtrim(s);
+}
+
+// Decide whether a segment may be split in front of the token text `txt`.
+// Without split_on_word every token is a valid split point; with it, only a
+// token whose text starts with a space qualifies.
+static inline bool should_split_on_word(const char * txt, bool split_on_word) {
+    return !split_on_word || txt[0] == ' ';
+}
+
 // wrap the last segment to max_len characters
 // returns the number of new segments
-static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
+static int whisper_wrap_segment(struct whisper_context & ctx, int max_len, bool split_on_word) {
    auto segment = ctx.result_all.back();

    int res = 1;
@ -3005,11 +3035,14 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
        }

        const auto txt = whisper_token_to_str(&ctx, token.id);
-
        const int cur = strlen(txt);

-        if (acc + cur > max_len && i > 0) {
+        if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
            // split here
+            if (split_on_word) {
+                trim(text);
+            }
+
            ctx.result_all.back().text = std::move(text);
            ctx.result_all.back().t1 = token.t0;
            ctx.result_all.back().tokens.resize(i);
@ -3037,11 +3070,22 @@ static int whisper_wrap_segment(struct whisper_context & ctx, int max_len) {
        }
    }

+    if (split_on_word) {
+        trim(text);
+    }
    ctx.result_all.back().text = std::move(text);

    return res;
 }

+// Token strings whose logits are masked (both as written and with a leading
+// space) when whisper_full_params.suppress_non_speech_tokens is enabled.
+static const std::vector<std::string> non_speech_tokens
+{
+    "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
+    "_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
+    "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
+    "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯"
+};
+
 // process the logits for the selected decoder
 // - applies logit filters
 // - computes logprobs and probs
@ -3102,6 +3146,33 @@ static void whisper_process_logits(
        logits[vocab.token_translate]  = -INFINITY;
        logits[vocab.token_transcribe] = -INFINITY;

+
+        // suppress non-speech tokens
+        // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
+        if (params.suppress_non_speech_tokens)
+        {
+            // mask one token if the vocabulary contains it
+            // (a single map lookup instead of find() followed by at())
+            const auto suppress = [&](const std::string & text)
+            {
+                const auto it = vocab.token_to_id.find(text);
+                if (it != vocab.token_to_id.end())
+                {
+                    logits[it->second] = -INFINITY;
+                }
+            };
+
+            for (const std::string & token : non_speech_tokens)
+            {
+                // both the bare token and its space-prefixed form
+                suppress(token);
+                suppress(" " + token);
+            }
+
+            // allow hyphens "-" and single quotes "'" between words, but not at the beginning of a word
+            suppress(" -");
+            suppress(" '");
+        }
+
        // timestamps have to appear in pairs, except directly before EOT; mask logits accordingly
        // https://github.com/openai/whisper/blob/0b1ba3d46ebf7fe6f953acfd8cad62a4f851b49f/whisper/decoding.py#L414-L424
        {
@ -3449,7 +3520,7 @@ int whisper_full(
            fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
            return -3;
        }
-
+        ctx->lang_id = lang_id;
        params.language = whisper_lang_str(lang_id);

        fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
@ -3546,6 +3617,7 @@ int whisper_full(
    std::vector<whisper_token> prompt_init = { whisper_token_sot(ctx) };
    if (whisper_is_multilingual(ctx)) {
        const int lang_id = whisper_lang_id(params.language);
+        ctx->lang_id = lang_id;
        prompt_init.push_back(whisper_token_lang(ctx, lang_id));
        if (params.translate) {
            prompt_init.push_back(whisper_token_translate());
@ -3793,7 +3865,7 @@ int whisper_full(

                        auto & cur = beam_candidates[cur_c++];

-                        while (beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
+                        while (beam_candidates.size() > cur_c && beam_candidates[cur_c].sequence.sum_logprobs_all == cur.sequence.sum_logprobs_all && i > 0) {
                            ++cur_c;
                        }

@ -4069,7 +4141,7 @@ int whisper_full(
                                        *ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);

                                if (params.max_len > 0) {
-                                    n_new = whisper_wrap_segment(*ctx, params.max_len);
+                                    n_new = whisper_wrap_segment(*ctx, params.max_len, params.split_on_word);
                                }
                            }
                            if (params.new_segment_callback) {
@ -4113,7 +4185,7 @@ int whisper_full(
                                *ctx, result_all.size() - 1, params.thold_pt, params.thold_ptsum);

                        if (params.max_len > 0) {
-                            n_new = whisper_wrap_segment(*ctx, params.max_len);
+                            n_new = whisper_wrap_segment(*ctx, params.max_len, params.split_on_word);
                        }
                    }
                    if (params.new_segment_callback) {
@ -4266,6 +4338,10 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
    return ctx->result_all.size();
 }

+// Return the language id stored in the context. whisper_full() writes it after
+// language auto-detection and when building the prompt for a multilingual model.
+int whisper_full_lang_id(struct whisper_context * ctx) {
+    return ctx->lang_id; 
+}
+
 int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {
    return ctx->result_all[i_segment].t0;
 }
--- a/whisper.h
+++ b/whisper.h
@ -113,6 +113,16 @@ extern "C" {
                               int   n_samples,
                               int   n_threads);

+    // Convert RAW PCM audio to a log mel spectrogram, applying a Phase Vocoder that speeds up the audio x2.
+    // The resulting spectrogram is stored inside the provided whisper context.
+    // Returns 0 on success
+    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
+        struct whisper_context* ctx,
+        const float* samples,
+        int   n_samples,
+        int   n_threads);
+
+
    // This can be used to set a custom log mel spectrogram inside the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
@ -257,6 +267,7 @@ extern "C" {
        float thold_pt;         // timestamp token probability threshold (~0.01)
        float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
        int   max_len;          // max segment length in characters
+        bool  split_on_word;    // split on word rather than on token (when used with max_len)
        int   max_tokens;       // max tokens per segment (0 = no limit)

        // [EXPERIMENTAL] speed-up techniques
@ -274,6 +285,7 @@ extern "C" {

        // common decoding parameters:
        bool suppress_blank;    // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
+        bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253

        float temperature;      // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
        float max_initial_ts;   // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
@ -329,6 +341,9 @@ extern "C" {
    // A segment can be a few words, a sentence, or even a paragraph.
    WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);

+    // Language id associated with the provided context (as stored by the most recent whisper_full() call)
+    WHISPER_API int whisper_full_lang_id(struct whisper_context * ctx);
+
    // Get the start and end time of the specified segment.
    WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
    WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
Author	SHA1	Message	Date
Georgi Gerganov	59c997ca2d	wip ignore	2023-02-15 19:11:12 +02:00
genevera (she/her)	459753342d	yt-wsp.sh : add unique filename generation (#495 ) Co-authored-by: genevera <genevera@noreply.users.github.com>	2023-02-14 20:12:51 +02:00
Georgi Gerganov	9764782bd9	readme : add another .NET repo (#303 )	2023-02-14 20:04:03 +02:00
Georgi Gerganov	3b010f9bed	readme : add .NET repo (#303 )	2023-02-11 17:35:33 +02:00
Avik Sengupta	113fcec513	cmake : install whisper.h header (#485 ) Including the header file in the install bundle helps projects that ship binaries.	2023-02-11 09:13:32 +02:00
shibukazu	cfc06bf8df	whisper : suppress non-speech-related token outputs (#473 ) * add non-speech-token suppression * add suppress non-speech_tokens param	2023-02-08 09:05:34 +02:00
sandrohanea	2bfe0ebc0f	whisper : fixed Beam Search Strategy and exposed whisper_pcm_to_mel_phase_vocoder (#474 ) Co-authored-by: Sandro Hanea <sandrohanea@microsoft.com>	2023-02-08 09:01:47 +02:00
boolemancer	4dd7119deb	whisper : only trim if split_on_word is true (#476 )	2023-02-08 08:43:23 +02:00
Qianhe Chen	ab1916fc59	ci : add node addon test and optimize compilation configuration (#468 ) * addon: implement node addon call whisper through cpp * addon: modify the license to MIT * addon: remove iostream * addon: rename dir * addon: fix typo * addon: configure cmake to build when cmake-js is used * ci: add addon.node test ci * addon: remove build WHISPER_BUILD_TESTS * addon: update build command * addon: add test * addon: add test file * addon: adapt to compile on Windows * addon: fix typo * addon: reuse jfk.wav Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * addon: reuse jfk.wav --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>	2023-02-05 15:02:08 +02:00
kamranjon	a1c1583cc7	whisper : add whisper_full_lang_id() for getting the context lang (#461 )	2023-02-05 14:46:26 +02:00
Matija Pevec	d012b5c7e4	whisper : add "split_on_word" flag when using using "max_len" option (#455 ) * Update whisper.cpp * fix: trim function * feat: added flag to split on word * fix: arguments for main	2023-02-05 14:44:23 +02:00