Compare commits

..

6 Commits

11 changed files with 26 additions and 99 deletions

1
.gitignore vendored
View File

@@ -10,6 +10,7 @@ build-em/
build-debug/
build-release/
build-static/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/

View File

@@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.0)
project(whisper.cpp VERSION 1.2.0)
project(whisper.cpp VERSION 1.2.1)
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

View File

@@ -141,6 +141,8 @@ ifdef WHISPER_GPROF
CXXFLAGS += -pg
endif
ifneq ($(filter aarch64%,$(UNAME_M)),)
CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native
endif
ifneq ($(filter armv6%,$(UNAME_M)),)
# Raspberry Pi 1, 2, 3

View File

@@ -4,7 +4,7 @@
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Stable: [v1.2.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@@ -469,7 +469,9 @@ in [models](models).
- [X] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
- [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
- [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
- [ ] Python: soon | [WIP](https://github.com/ggerganov/whisper.cpp/issues/9)
- [X] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
## Examples

View File

@@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.2.0",
"version": "1.2.1",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {

View File

@@ -72,7 +72,7 @@ int timestamp_to_sample(int64_t t, int n_samples) {
return std::max(0, std::min((int) n_samples - 1, (int) ((t*WHISPER_SAMPLE_RATE)/100)));
}
void whisper_print_segment(struct whisper_context * ctx, int n_new, void * user_data) {
void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
@@ -250,7 +250,7 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
// this callback is called on each new segment
if (!wparams.print_realtime) {
wparams.new_segment_callback = whisper_print_segment;
wparams.new_segment_callback = whisper_print_segment_callback;
wparams.new_segment_callback_user_data = &user_data;
}

View File

@@ -109,73 +109,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "\n");
}
struct whisper_logits_filter_user_data {
std::vector<std::string> * allowed_commands;
std::vector<std::vector<whisper_token>> * allowed_tokens;
};
void whisper_logits_filter(
struct whisper_context * ctx,
const whisper_token_data * tokens,
int n_tokens,
float * logits,
void * user_data){
const auto & allowed_tokens = *((whisper_logits_filter_user_data *) user_data)->allowed_tokens;
printf("n_tokens = %d\n", n_tokens);
for (int i = 0; i < n_tokens; i++) {
printf(" - '%s' (%.2f)\n", whisper_token_to_str(ctx, tokens[i].id), logits[i]);
}
if (n_tokens == 0) {
return;
}
std::vector<std::pair<whisper_token, float>> pool;
for (int i = 0; i < (int) allowed_tokens.size(); i++) {
const int n = (int) allowed_tokens[i].size();
if (n_tokens > n) {
continue;
}
const whisper_token id = allowed_tokens[i][n_tokens - 1];
pool.push_back({ id, logits[id] });
}
if (pool.empty()) {
return;
}
printf("applying logits filter, pool size = %d\n", (int) pool.size());
const int ibeg = whisper_token_beg(ctx);
double sum_all = 0.0;
for (int i = 0; i < ibeg; ++i) {
if (logits[i] == -INFINITY) {
continue;
}
sum_all += logits[i];
}
double sum_pool = 0.0;
for (int i = 0; i < (int) pool.size(); ++i) {
sum_pool += pool[i].second;
}
printf("sum_all = %.2f, sum_pool = %.2f\n", sum_all, sum_pool);
for (int i = 0; i < ibeg; ++i) {
logits[i] = -INFINITY;
}
for (int i = 0; i < (int) pool.size(); ++i) {
//logits[pool[i].first] = pool[i].second / sum_pool * sum_all;
logits[pool[i].first] = pool[i].second;
printf(" - '%s' (%.2f)\n", whisper_token_to_str(ctx, pool[i].first), logits[pool[i].first]);
}
}
std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
@@ -198,8 +131,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.temperature_inc = -1.0f;
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
return "";
}
@@ -403,31 +334,22 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
//wparams.max_tokens = 1;
wparams.max_tokens = 1;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.temperature_inc = -1.0f;
wparams.prompt_tokens = k_tokens.data();
wparams.prompt_n_tokens = k_tokens.size();
whisper_logits_filter_user_data user_data = { &allowed_commands, &allowed_tokens };
wparams.logits_filter_callback = whisper_logits_filter;
wparams.logits_filter_callback_user_data = &user_data;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32_cur.data(), pcmf32_cur.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
break;
}
fprintf(stdout, "%s: text - '%s'\n", __func__, whisper_full_get_segment_text(ctx, 0));
// estimate command probability
// NOTE: not optimal
{
@@ -514,7 +436,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
// always-prompt mode
// transcribe the voice into text after valid prompt
int process_always_prompt(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true;
bool ask_prompt = true;
@@ -574,7 +496,7 @@ int process_always_prompt(struct whisper_context * ctx, audio_async & audio, con
const float sim = similarity(prompt, k_prompt);
//debug
//fprintf(stdout, "command size: %d, sim: %f\n", (int) command.size(), sim);
//fprintf(stdout, "command size: %i\n", command_length);
if ((sim > 0.7f) && (command.size() > 0)) {
fprintf(stdout, "%s: Command '%s%s%s', (t = %d ms)\n", __func__, "\033[1m", command.c_str(), "\033[0m", (int) t_ms);
@@ -754,7 +676,7 @@ int main(int argc, char ** argv) {
if (!params.commands.empty()) {
ret_val = process_command_list(ctx, audio, params);
} else if (!params.prompt.empty()) {
ret_val = process_always_prompt(ctx, audio, params);
ret_val = always_prompt_transcription(ctx, audio, params);
} else {
ret_val = process_general_transcription(ctx, audio, params);
}

View File

@@ -1,13 +1,13 @@
#pragma once
#include <SDL.h>
#include <SDL_audio.h>
#include <atomic>
#include <cstdint>
#include <vector>
#include <mutex>
#include <SDL.h>
#include <SDL_audio.h>
//
// SDL Audio capture
//

View File

@@ -193,7 +193,7 @@ struct whisper_print_user_data {
const std::vector<std::vector<float>> * pcmf32s;
};
void whisper_print_segment(struct whisper_context * ctx, int n_new, void * user_data) {
void whisper_print_segment_callback(struct whisper_context * ctx, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
@@ -597,7 +597,7 @@ int main(int argc, char ** argv) {
// this callback is called on each new segment
if (!wparams.print_realtime) {
wparams.new_segment_callback = whisper_print_segment;
wparams.new_segment_callback = whisper_print_segment_callback;
wparams.new_segment_callback_user_data = &user_data;
}

View File

@@ -592,16 +592,16 @@ struct whisper_context {
mutable std::mt19937 rng; // used for sampling at t > 0.0
int lang_id;
int lang_id = 0; // english by default
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg;
int64_t t_last;
int64_t t_beg = 0;
int64_t t_last = 0;
whisper_token tid_last;
std::vector<float> energy; // PCM signal energy
// [EXPERIMENTAL] speed-up techniques
int32_t exp_n_audio_ctx; // 0 - use default
int32_t exp_n_audio_ctx = 0; // 0 - use default
void use_buf(struct ggml_context * ctx, int i) {
#if defined(WHISPER_USE_SCRATCH)
@@ -4339,7 +4339,7 @@ int whisper_full_n_segments(struct whisper_context * ctx) {
}
int whisper_full_lang_id(struct whisper_context * ctx) {
return ctx->lang_id;
return ctx->lang_id;
}
int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment) {