ref #9 : add API documentation in whisper.h

This commit is contained in:
Georgi Gerganov 2022-10-08 18:09:56 +03:00
parent 5e563ef635
commit 9bbca3110f
2 changed files with 73 additions and 9 deletions

View File

@ -230,25 +230,25 @@ int main(int argc, char ** argv) {
// print result // print result
if (!wparams.print_realtime) { if (!wparams.print_realtime) {
fprintf(stderr, "\n"); printf("\n");
const int n_segments = whisper_full_n_segments(ctx); const int n_segments = whisper_full_n_segments(ctx);
for (int i = 0; i < n_segments; ++i) { for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(ctx, i); const char * text = whisper_full_get_segment_text(ctx, i);
if (params.no_timestamps) { if (params.no_timestamps) {
fprintf(stderr, "%s", text); printf("%s", text);
fflush(stdout); fflush(stdout);
} else { } else {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
fprintf(stderr, "[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text); printf("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
} }
} }
} }
fprintf(stderr, "\n"); printf("\n");
// output to text file // output to text file
if (params.output_txt) { if (params.output_txt) {

View File

@ -31,33 +31,81 @@ extern "C" {
// C interface // C interface
// //
// TODO: documentation will come soon //
// Basic usage:
//
// #include "whisper.h"
//
// ...
//
// struct whisper_context * ctx = whisper_init("/path/to/ggml-base.en.bin");
//
// if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
// fprintf(stderr, "failed to process audio\n");
// return 7;
// }
//
// const int n_segments = whisper_full_n_segments(ctx);
// for (int i = 0; i < n_segments; ++i) {
// const char * text = whisper_full_get_segment_text(ctx, i);
// printf("%s", text);
// }
//
// whisper_free(ctx);
//
// ...
//
// This is a demonstration of the most straightforward usage of the library.
// "pcmf32" contains the RAW audio data in 32-bit floating point format.
//
// The interface also allows for more fine-grained control over the computation, but it requires a deeper
// understanding of how the model works.
//
struct whisper_context; struct whisper_context;
typedef int whisper_token; typedef int whisper_token;
// Allocates all memory needed for the model and loads the model from the given file.
// Returns NULL on failure.
WHISPER_API struct whisper_context * whisper_init(const char * path_model); WHISPER_API struct whisper_context * whisper_init(const char * path_model);
// Frees all memory allocated by the model.
WHISPER_API void whisper_free(struct whisper_context * ctx); WHISPER_API void whisper_free(struct whisper_context * ctx);
// Convert RAW PCM audio to log mel spectrogram.
// The resulting spectrogram is stored inside the provided whisper context.
// Returns 0 on success
WHISPER_API int whisper_pcm_to_mel( WHISPER_API int whisper_pcm_to_mel(
struct whisper_context * ctx, struct whisper_context * ctx,
const float * samples, const float * samples,
int n_samples, int n_samples,
int n_threads); int n_threads);
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
// n_mel must be 80 // n_mel must be 80
// Returns 0 on success
WHISPER_API int whisper_set_mel( WHISPER_API int whisper_set_mel(
struct whisper_context * ctx, struct whisper_context * ctx,
const float * data, const float * data,
int n_len, int n_len,
int n_mel); int n_mel);
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
// offset can be used to specify the offset of the first frame in the spectrogram.
// Returns 0 on success
WHISPER_API int whisper_encode( WHISPER_API int whisper_encode(
struct whisper_context * ctx, struct whisper_context * ctx,
int offset, int offset,
int n_threads); int n_threads);
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
// Make sure to call whisper_encode() first.
// tokens + n_tokens is the provided context for the decoder.
// n_past is the number of tokens to use from previous decoder calls.
// Returns 0 on success
WHISPER_API int whisper_decode( WHISPER_API int whisper_decode(
struct whisper_context * ctx, struct whisper_context * ctx,
const whisper_token * tokens, const whisper_token * tokens,
@ -65,10 +113,15 @@ extern "C" {
int n_past, int n_past,
int n_threads); int n_threads);
// Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode().
// You can also implement your own sampling method using the whisper_get_probs() function.
// whisper_sample_best() returns the token with the highest probability
// whisper_sample_timestamp() returns the most probable timestamp token
WHISPER_API whisper_token whisper_sample_best(struct whisper_context * ctx, bool need_timestamp); WHISPER_API whisper_token whisper_sample_best(struct whisper_context * ctx, bool need_timestamp);
WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx); WHISPER_API whisper_token whisper_sample_timestamp(struct whisper_context * ctx);
// return the id of the specified language, returns -1 if not found // Return the id of the specified language, returns -1 if not found
WHISPER_API int whisper_lang_id(const char * lang); WHISPER_API int whisper_lang_id(const char * lang);
WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length WHISPER_API int whisper_n_len (struct whisper_context * ctx); // mel length
@ -76,10 +129,13 @@ extern "C" {
WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx); WHISPER_API int whisper_n_text_ctx (struct whisper_context * ctx);
WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx); WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
// The probabilities for the next token
WHISPER_API float * whisper_get_probs(struct whisper_context * ctx); WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
// Token Id -> String. Uses the vocabulary in the provided context
WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token); WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
// Special tokens
WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx); WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx); WHISPER_API whisper_token whisper_token_sot (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx); WHISPER_API whisper_token whisper_token_prev(struct whisper_context * ctx);
@ -87,16 +143,19 @@ extern "C" {
WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx); WHISPER_API whisper_token whisper_token_not (struct whisper_context * ctx);
WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx); WHISPER_API whisper_token whisper_token_beg (struct whisper_context * ctx);
// Task tokens
WHISPER_API whisper_token whisper_token_translate (); WHISPER_API whisper_token whisper_token_translate ();
WHISPER_API whisper_token whisper_token_transcribe(); WHISPER_API whisper_token whisper_token_transcribe();
// Performance information
WHISPER_API void whisper_print_timings(struct whisper_context * ctx); WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
//////////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////
// Available decoding strategies
enum whisper_decode_strategy { enum whisper_decode_strategy {
WHISPER_DECODE_GREEDY, WHISPER_DECODE_GREEDY, // Always select the most probable token
WHISPER_DECODE_BEAM_SEARCH, WHISPER_DECODE_BEAM_SEARCH, // TODO: not implemented yet!
}; };
struct whisper_full_params { struct whisper_full_params {
@ -129,18 +188,23 @@ extern "C" {
WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_decode_strategy strategy); WHISPER_API struct whisper_full_params whisper_full_default_params(enum whisper_decode_strategy strategy);
// full whisper run - encode + decode // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Uses the specified decoding strategy to obtain the text.
WHISPER_API int whisper_full( WHISPER_API int whisper_full(
struct whisper_context * ctx, struct whisper_context * ctx,
struct whisper_full_params params, struct whisper_full_params params,
const float * samples, const float * samples,
int n_samples); int n_samples);
// Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph.
WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx); WHISPER_API int whisper_full_n_segments(struct whisper_context * ctx);
// Get the start and end time of the specified segment.
WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment); WHISPER_API int64_t whisper_full_get_segment_t0(struct whisper_context * ctx, int i_segment);
WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment); WHISPER_API int64_t whisper_full_get_segment_t1(struct whisper_context * ctx, int i_segment);
// Get the text of the specified segment.
WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment); WHISPER_API const char * whisper_full_get_segment_text(struct whisper_context * ctx, int i_segment);
#ifdef __cplusplus #ifdef __cplusplus