whisper : remove speed_up and phase_vocoder* functions (#2198)

* whisper : fix cast warning * whisper : remove phase_vocoder functions, ref #2195 * whisper : remove speed_up from whisper_full_params, closes #2195
2025-06-24 19:51:26 +02:00 · 2024-05-31 11:37:29 +03:00 · 2024-05-31 11:37:29 +03:00 · af5833e298
commit af5833e298
parent b87494bb8f
20 changed files with 14 additions and 161 deletions
--- a/bindings/go/examples/go-whisper/flags.go
+++ b/bindings/go/examples/go-whisper/flags.go
@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
 	return strings.ToLower(flags.Lookup("out").Value.String())
 }

-func (flags *Flags) IsSpeedup() bool {
-	return flags.Lookup("speedup").Value.String() == "true"
-}
-
 func (flags *Flags) IsTokens() bool {
 	return flags.Lookup("tokens").Value.String() == "true"
 }
@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
 		fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
 		context.SetDuration(duration)
 	}
-	if flags.IsSpeedup() {
-		fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
-		context.SetSpeedup(true)
-	}
 	if threads := flags.GetThreads(); threads != 0 {
 		fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
 		context.SetThreads(threads)
@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
 	flag.Duration("offset", 0, "Time offset")
 	flag.Duration("duration", 0, "Duration of audio to process")
 	flag.Uint("threads", 0, "Number of threads to use")
-	flag.Bool("speedup", false, "Enable speedup")
 	flag.Uint("max-len", 0, "Maximum segment length in characters")
 	flag.Uint("max-tokens", 0, "Maximum tokens per segment")
 	flag.Float64("word-thold", 0, "Maximum segment score")
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
 	p.print_timestamps = toBool(v)
 }

-func (p *Params) SetSpeedup(v bool) {
-	p.speed_up = toBool(v)
-}
-
 // Set language id
 func (p *Params) SetLanguage(lang int) error {
 	if lang == -1 {
@ -177,9 +173,6 @@ func (p *Params) String() string {
 	if p.token_timestamps {
 		str += " token_timestamps"
 	}
-	if p.speed_up {
-		str += " speed_up"
-	}

 	return str + ">"
 }
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
 	context.params.SetTranslate(v)
 }

-// Set speedup flag
-func (context *context) SetSpeedup(v bool) {
-	context.params.SetSpeedup(v)
-}
-
 func (context *context) SetSplitOnWord(v bool) {
 	context.params.SetSplitOnWord(v)
 }
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@ -41,7 +41,6 @@ type Context interface {
 	SetOffset(time.Duration)        // Set offset
 	SetDuration(time.Duration)      // Set duration
 	SetThreads(uint)                // Set number of threads to use
-	SetSpeedup(bool)                // Set speedup flag
 	SetSplitOnWord(bool)            // Set split on word flag
 	SetTokenThreshold(float32)      // Set timestamp token probability threshold
 	SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
     * @return Whisper context on success, null on failure
     */
    Pointer whisper_init_from_file(String path_model);
-    
+
    /**
     * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
     * Because this function allocates memory for the params, the caller must call either:
@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
    /** Language id associated with the provided state */
    int whisper_full_lang_id_from_state(Pointer state);

-    /**
-     * Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-     * The resulting spectrogram is stored inside the default state of the provided whisper context.
-     * @return 0 on success
-     */
-    int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);
-
-    int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);

    /** Get the start time of the specified segment. */
    long whisper_full_get_segment_t0(Pointer ctx, int i_segment);
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@ -129,14 +129,6 @@ public class WhisperFullParams extends Structure {
    /** Maximum tokens per segment (0, default = no limit) */
    public int max_tokens;

-    /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
-    public CBool speed_up;
-
-    /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */
-    public void speedUp(boolean enable) {
-        speed_up = enable ? CBool.TRUE : CBool.FALSE;
-    }
-
    /** Overwrite the audio context size (0 = use default). */
    public int audio_ctx;

@ -321,7 +313,7 @@ public class WhisperFullParams extends Structure {
        return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
                "no_context", "single_segment", "no_timestamps",
                "print_special", "print_progress", "print_realtime", "print_timestamps",  "token_timestamps",
-                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",
+                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
                "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
                "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",
                "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
 static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
  BOOL_PARAMS_SETTER(self, split_on_word, value)
 }
-static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
-  BOOL_PARAMS_GETTER(self, speed_up)
-}
-static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, speed_up, value)
-}
 static VALUE ruby_whisper_params_get_diarize(VALUE self) {
  ruby_whisper_params *rwp;
  Data_Get_Struct(self, ruby_whisper_params, rwp);
@ -408,8 +402,6 @@ void Init_whisper() {
  rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
  rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
  rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
-  rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
-  rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
  rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
  rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);

--- a/bindings/ruby/tests/test_whisper.rb
+++ b/bindings/ruby/tests/test_whisper.rb
@ -117,13 +117,6 @@ class TestWhisper < Test::Unit::TestCase
    assert !@params.split_on_word
  end

-  def test_speed_up
-    @params.speed_up = true
-    assert @params.speed_up
-    @params.speed_up = false
-    assert !@params.speed_up
-  end
-
  def test_whisper
    @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
    params  = Whisper::Params.new
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@ -25,7 +25,6 @@ struct whisper_params {
    float entropy_thold = 2.4f;
    float logprob_thold = -1.0f;

-    bool speed_up       = false;
    bool translate      = false;
    bool diarize        = false;
    bool output_txt     = false;
@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
            wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
            wparams.audio_ctx        = params.audio_ctx;

-            wparams.speed_up         = params.speed_up;
-
            wparams.greedy.best_of        = params.best_of;
            wparams.beam_search.beam_size = params.beam_size;

--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@ -38,7 +38,6 @@ struct whisper_params {

    grammar_parser::parse_state grammar_parsed;

-    bool speed_up      = false;
    bool translate     = false;
    bool print_special = false;
    bool print_energy  = false;
@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
    fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
    fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
    fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@ -165,7 +162,6 @@ std::string transcribe(
    wparams.n_threads        = params.n_threads;

    wparams.audio_ctx = params.audio_ctx;
-    wparams.speed_up  = params.speed_up;

    wparams.temperature     = 0.4f;
    wparams.temperature_inc = 1.0f;
@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
            wparams.n_threads        = params.n_threads;

            wparams.audio_ctx        = params.audio_ctx;
-            wparams.speed_up         = params.speed_up;

            wparams.prompt_tokens    = k_tokens.data();
            wparams.prompt_n_tokens  = k_tokens.size();
--- a/examples/common.h
+++ b/examples/common.h
@ -185,7 +185,7 @@ private:
    // It is assumed that PCM data is normalized to a range from -1 to 1
    bool write_audio(const float * data, size_t length) {
        for (size_t i = 0; i < length; ++i) {
-            const int16_t intSample = data[i] * 32767;
+            const int16_t intSample = int16_t(data[i] * 32767);
            file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
            dataSize += sizeof(int16_t);
        }
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@ -26,7 +26,6 @@ struct whisper_params {
    float vad_thold    = 0.6f;
    float freq_thold   = 100.0f;

-    bool speed_up      = false;
    bool translate     = false;
    bool print_special = false;
    bool print_energy  = false;
@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
    fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
    fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
    fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
    wparams.n_threads        = params.n_threads;

    wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
    wparams.suppress_non_speech_tokens = true;
    // run the transformer and a single decoding pass
    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
    wparams.n_threads        = params.n_threads;

    wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;

    // TODO: Do some time testing. Does an overly long prompt slow down processing?
    // Set up command sets/precompute prompts
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -47,7 +47,6 @@ struct whisper_params {
    float temperature     = 0.0f;
    float temperature_inc = 0.2f;

-    bool speed_up        = false;
    bool debug_mode      = false;
    bool translate       = false;
    bool detect_language = false;
@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
        else if (arg == "-tp"   || arg == "--temperature")     { params.temperature     = std::stof(argv[++i]); }
        else if (arg == "-tpi"  || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
        else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
        else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
        else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
    fprintf(stderr, "  -tp,       --temperature N     [%-7.2f] The sampling temperature, between 0 and 1\n",    params.temperature);
    fprintf(stderr, "  -tpi,      --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
    fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
            wparams.split_on_word    = params.split_on_word;
            wparams.audio_ctx        = params.audio_ctx;

-            wparams.speed_up         = params.speed_up;
            wparams.debug_mode       = params.debug_mode;

            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -61,7 +61,6 @@ struct whisper_params {
    float temperature     =  0.00f;
    float temperature_inc =  0.20f;

-    bool speed_up        = false;
    bool debug_mode      = false;
    bool translate       = false;
    bool detect_language = false;
@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
    fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
    fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
    fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
    fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
    fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
        else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
        else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
        else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
        else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
        else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
        else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
            wparams.split_on_word    = params.split_on_word;
            wparams.audio_ctx        = params.audio_ctx;

-            wparams.speed_up         = params.speed_up;
            wparams.debug_mode       = params.debug_mode;

            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@ -27,7 +27,6 @@ struct whisper_params {
    float vad_thold    = 0.6f;
    float freq_thold   = 100.0f;

-    bool speed_up      = false;
    bool translate     = false;
    bool no_fallback   = false;
    bool print_special = false;
@ -62,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ac"   || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
        else if (arg == "-vth"  || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
        else if (arg == "-fth"  || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"   || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-nf"   || arg == "--no-fallback")   { params.no_fallback   = true; }
        else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
@ -100,7 +98,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",           params.vad_thold);
    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                   params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
    fprintf(stderr, "  -nf,      --no-fallback   [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
@ -314,7 +311,6 @@ int main(int argc, char ** argv) {
            wparams.n_threads        = params.n_threads;

            wparams.audio_ctx        = params.audio_ctx;
-            wparams.speed_up         = params.speed_up;

            wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]

--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@ -59,7 +59,6 @@ struct whisper_params {
    float vad_thold  = 0.6f;
    float freq_thold = 100.0f;

-    bool speed_up       = false;
    bool translate      = false;
    bool print_special  = false;
    bool print_energy   = false;
@ -100,7 +99,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ngl" || arg == "--n-gpu-layers")   { params.n_gpu_layers   = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")      { params.vad_thold      = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")     { params.freq_thold     = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")       { params.speed_up       = true; }
        else if (arg == "-tr"  || arg == "--translate")      { params.translate      = true; }
        else if (arg == "-ps"  || arg == "--print-special")  { params.print_special  = true; }
        else if (arg == "-pe"  || arg == "--print-energy")   { params.print_energy   = true; }
@ -149,7 +147,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);
    fprintf(stderr, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(stderr, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
    fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
    fprintf(stderr, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@ -205,7 +202,6 @@ std::string transcribe(
    wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();

    wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;

    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@ -26,7 +26,6 @@ struct whisper_params {
    float vad_thold    = 0.6f;
    float freq_thold   = 100.0f;

-    bool speed_up      = false;
    bool translate     = false;
    bool print_special = false;
    bool print_energy  = false;
@ -60,7 +59,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@ -96,7 +94,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
    fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
    fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
    fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@ -132,7 +129,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
    wparams.n_threads        = params.n_threads;

    wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;

    if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
        return "";
--- a/examples/wchess/wchess.cmd/wchess.cmd.cpp
+++ b/examples/wchess/wchess.cmd/wchess.cmd.cpp
@ -26,7 +26,6 @@ struct whisper_params {

    float grammar_penalty = 100.0f;

-    bool speed_up      = false;
    bool translate     = false;
    bool print_special = false;
    bool print_energy  = false;
@ -57,7 +56,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
    fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
    fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
    fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
    fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
    fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
    fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@ -89,7 +87,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
        else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
        else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
        else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
        else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
        else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
        else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
--- a/whisper.cpp
+++ b/whisper.cpp
@ -2868,13 +2868,10 @@ struct whisper_global_cache {
    // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
    // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
    float hann_window[WHISPER_N_FFT];
-    float hann_window2x[WHISPER_N_FFT * 2];

    whisper_global_cache() {
        fill_sin_cos_table();
-#define FILL_HANN_WINDOW(arr) fill_hann_window(sizeof(arr) / sizeof(arr[0]), true, arr)
-        FILL_HANN_WINDOW(hann_window);
-        FILL_HANN_WINDOW(hann_window2x);
+        fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window);
    }

    void fill_sin_cos_table() {
@ -2885,7 +2882,7 @@ struct whisper_global_cache {
        }
    }

-    void fill_hann_window(int length, bool periodic, float* output) {
+    void fill_hann_window(int length, bool periodic, float * output) {
        int offset = -1;
        if (periodic) {
            offset = 0;
@ -3061,15 +3058,8 @@ static bool log_mel_spectrogram(
    const int64_t t_start_us = ggml_time_us();

    // Hann window
-    const float * hann = nullptr;
-    if (frame_size == WHISPER_N_FFT) {
-        hann = global_cache.hann_window;
-    } else if (frame_size == 2 * WHISPER_N_FFT) {
-        hann = global_cache.hann_window2x;
-    } else {
-        WHISPER_ASSERT(false && "Unsupported frame_size");
-        return false;
-    }
+    WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
+    const float * hann = global_cache.hann_window;

    // Calculate the length of padding
    int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
@ -3752,30 +3742,6 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
    return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
 }

-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
-        WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
-}
-
-// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
-// TODO
-
 int whisper_set_mel_with_state(
        struct whisper_context * ctx,
          struct whisper_state * state,
@ -4676,7 +4642,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
        /*.split_on_word     =*/ false,
        /*.max_tokens        =*/ 0,

-        /*.speed_up          =*/ false,
        /*.debug_mode        =*/ false,
        /*.audio_ctx         =*/ 0,

@ -5350,15 +5315,9 @@ int whisper_full_with_state(

    if (n_samples > 0) {
        // compute log mel spectrogram
-        if (params.speed_up) {
-            // TODO: Replace PV with more advanced algorithm
+        if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
            WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
-            return -1;
-        } else {
-            if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-                WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
-                return -2;
-            }
+            return -2;
        }
    }

@ -5395,7 +5354,7 @@ int whisper_full_with_state(
    // if length of spectrogram is less than 1.0s (100 frames), then return
    // basically don't process anything that is less than 1.0s
    // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
-    if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
+    if (seek_end < seek_start + 100) {
        WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
        return 0;
    }
@ -6107,8 +6066,8 @@ int whisper_full_with_state(
                        const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));

                        if (!text.empty()) {
-                            const auto tt0 = params.speed_up ? 2*t0 : t0;
-                            const auto tt1 = params.speed_up ? 2*t1 : t1;
+                            const auto tt0 = t0;
+                            const auto tt1 = t1;

                            if (params.print_realtime) {
                                if (params.print_timestamps) {
@ -6154,8 +6113,8 @@ int whisper_full_with_state(
                if (!text.empty()) {
                    const auto t1 = seek + seek_delta;

-                    const auto tt0 = params.speed_up ? 2*t0 : t0;
-                    const auto tt1 = params.speed_up ? 2*t1 : t1;
+                    const auto tt0 = t0;
+                    const auto tt1 = t1;

                    if (params.print_realtime) {
                        if (params.print_timestamps) {
--- a/whisper.h
+++ b/whisper.h
@ -266,22 +266,6 @@ extern "C" {
                               int   n_samples,
                               int   n_threads);

-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context * ctx,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
    // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
    // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
    // n_mel must be 80
@ -499,7 +483,6 @@ extern "C" {

        // [EXPERIMENTAL] speed-up techniques
        // note: these can significantly reduce the quality of the output
-        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
        bool debug_mode;        // enable debug_mode provides extra info (eg. Dump log_mel)
        int  audio_ctx;         // overwrite the audio context size (0 = use default)