whisper : reorganize source code + improve CMake (#2256)

* scripts : update sync [no ci]

* files : reorganize [no ci]

* sync : llama.cpp

* cmake : link math library

* cmake : build normal ggml library

* files : move headers to include

* objc : fix path to ggml-metal.h

* ci : fix WHISPER_CUDA -> GGML_CUDA

* scripts : sync LICENSE [no ci]
This commit is contained in:
Georgi Gerganov
2024-06-26 19:34:09 +03:00
committed by GitHub
parent bf4cb4abad
commit e30c679928
361 changed files with 176536 additions and 20466 deletions

View File

@ -174,6 +174,7 @@ extern "C" {
LLAMA_POOLING_TYPE_NONE = 0,
LLAMA_POOLING_TYPE_MEAN = 1,
LLAMA_POOLING_TYPE_CLS = 2,
LLAMA_POOLING_TYPE_LAST = 3,
};
enum llama_split_mode {
@ -293,7 +294,6 @@ extern "C" {
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
// (ignored if no pooling layer)
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
@ -786,6 +786,10 @@ extern "C" {
// Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
// Set whether to use causal attention or not
// If set to true, the model will only attend to the past tokens
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);