diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp index d2db0b8..040ba9e 100644 --- a/examples/stream/stream.cpp +++ b/examples/stream/stream.cpp @@ -322,6 +322,7 @@ int main(int argc, char ** argv) { { whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY); + wparams.max_tokens = 32; wparams.print_progress = false; wparams.print_special_tokens = params.print_special_tokens; wparams.print_realtime = false; diff --git a/whisper.cpp b/whisper.cpp index 95579ec..48f93eb 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -2402,6 +2402,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str /*.thold_pt =*/ 0.01f, /*.thold_ptsum =*/ 0.01f, /*.max_len =*/ 0, + /*.max_tokens =*/ 0, /*.speed_up =*/ false, @@ -2443,6 +2444,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str /*.thold_pt =*/ 0.01f, /*.thold_ptsum =*/ 0.01f, /*.max_len =*/ 0, + /*.max_tokens =*/ 0, /*.speed_up =*/ false, @@ -2685,7 +2687,7 @@ int whisper_full( //} // end of text token - if (token.id == whisper_token_eot(ctx) || (i > WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT)) { + if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) { if (result_len == 0) { if (seek + seek_delta + 100 >= seek_end) { result_len = i + 1; diff --git a/whisper.h b/whisper.h index ec4b1fb..0211995 100644 --- a/whisper.h +++ b/whisper.h @@ -25,7 +25,6 @@ #define WHISPER_CHUNK_SIZE 30 #define WHISPER_EXPERIMENT_AUDIO_CTX 512 -#define WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT 32 #ifdef __cplusplus extern "C" { @@ -205,6 +204,7 @@ extern "C" { float thold_pt; // timestamp token probability threshold (~0.01) float thold_ptsum; // timestamp token sum probability threshold (~0.01) int max_len; // max segment length in characters + int max_tokens; // max tokens per segment (0 = no limit) // [EXPERIMENTAL] speed-up techniques bool speed_up; // speed-up the audio by 2x using Phase Vocoder