talk-llama : sync llama.cpp
commit 02b4c52c12
parent 518199c09e
File diff suppressed because it is too large.
@@ -3,15 +3,7 @@
 
 #include "ggml.h"
 #include "ggml-backend.h"
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#define LLAMA_MAX_DEVICES GGML_CUDA_MAX_DEVICES
-#elif defined(GGML_USE_SYCL)
-#include "ggml-sycl.h"
-#define LLAMA_MAX_DEVICES GGML_SYCL_MAX_DEVICES
-#else
-#define LLAMA_MAX_DEVICES 1
-#endif // GGML_USE_CUBLAS
+
 #include <stddef.h>
 #include <stdint.h>
 #include <stdio.h>
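With the LLAMA_MAX_DEVICES macro removed from the header, callers can no longer size per-device arrays at compile time. A minimal caller-side sketch, assuming the runtime query llama_max_devices() declared later in this same diff:

#include "llama.h"
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    // Previously: float split[LLAMA_MAX_DEVICES]; the count is now a runtime value.
    size_t n_dev = llama_max_devices();
    float * split = calloc(n_dev, sizeof(float));
    if (split == NULL) return 1;

    printf("per-device array sized for %zu devices\n", n_dev);
    free(split);
    return 0;
}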
@@ -49,11 +41,6 @@
 #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
 #define LLAMA_SESSION_VERSION 4
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
-// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
-#define LLAMA_SUPPORTS_GPU_OFFLOAD
-#endif
-
 #ifdef __cplusplus
 extern "C" {
 #endif
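The compile-time LLAMA_SUPPORTS_GPU_OFFLOAD guard is dropped as well, so the check moves to run time. A small sketch, assuming llama_supports_gpu_offload() declared later in this diff:

#include "llama.h"
#include <stdio.h>

int main(void) {
    struct llama_model_params mparams = llama_model_default_params();

    // Previously guarded by "#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD"; now a runtime query.
    if (llama_supports_gpu_offload()) {
        mparams.n_gpu_layers = 99; // offload as many layers as possible
    }

    printf("gpu offload available: %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    printf("n_gpu_layers = %d\n", (int) mparams.n_gpu_layers);
    return 0;
}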
@@ -111,6 +98,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ2_XS  = 20, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q2_K_S  = 21, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q3_K_XS = 22, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
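The new LLAMA_FTYPE_MOSTLY_IQ3_XXS entry can be requested through the existing quantization API. A sketch with placeholder file names, assuming llama_model_quantize_default_params() and llama_model_quantize() as declared elsewhere in llama.h:

#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_IQ3_XXS; // new entry, value 23
    qparams.nthread = 4;

    // File names are placeholders.
    uint32_t rc = llama_model_quantize("model-f16.gguf", "model-iq3_xxs.gguf", &qparams);
    if (rc != 0) {
        fprintf(stderr, "quantization failed with code %u\n", rc);
        return 1;
    }
    return 0;
}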
@@ -199,7 +187,7 @@ extern "C" {
         // LLAMA_SPLIT_LAYER: ignored
         int32_t main_gpu;
 
-        // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         const float * tensor_split;
 
         // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
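The updated comment reflects that tensor_split must now hold llama_max_devices() entries. A sketch of a two-GPU split with a placeholder model path; backend initialization and error handling are omitted:

#include "llama.h"
#include <stdlib.h>

int main(void) {
    size_t n_dev = llama_max_devices();
    float * split = calloc(n_dev, sizeof(float));
    if (split == NULL) return 1;

    if (n_dev >= 2) {
        split[0] = 0.6f; // 60% of the layers/rows on device 0
        split[1] = 0.4f; // 40% on device 1
    }

    struct llama_model_params mparams = llama_model_default_params();
    mparams.main_gpu     = 0;     // primary device (see the split_mode comments in the header)
    mparams.tensor_split = split; // must point at llama_max_devices() entries
    mparams.n_gpu_layers = 99;

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model != NULL) {
        llama_free_model(model);
    }

    free(split);
    return 0;
}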
@@ -225,7 +213,7 @@ extern "C" {
         uint32_t n_batch;         // prompt processing maximum batch size
         uint32_t n_threads;       // number of threads to use for generation
         uint32_t n_threads_batch; // number of threads to use for batch processing
-        int8_t  rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
+        int32_t rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
 
         // ref: https://github.com/ggerganov/llama.cpp/pull/2054
         float rope_freq_base; // RoPE base frequency, 0 = from model
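rope_scaling_type is widened from int8_t to int32_t, so it can hold any value of enum llama_rope_scaling_type directly. A sketch, assuming the LLAMA_ROPE_SCALING_YARN constant from this era's header:

#include "llama.h"
#include <stdio.h>

int main(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx             = 8192;
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_YARN; // stored in an int32_t now
    cparams.rope_freq_base    = 0.0f;                    // 0 = take the value from the model

    printf("rope_scaling_type = %d\n", (int) cparams.rope_scaling_type);
    return 0;
}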
@@ -336,9 +324,14 @@ extern "C" {
 
     LLAMA_API int64_t llama_time_us(void);
 
-    LLAMA_API int32_t llama_max_devices(void);
-    LLAMA_API bool llama_mmap_supported (void);
-    LLAMA_API bool llama_mlock_supported(void);
+    LLAMA_API size_t llama_max_devices(void);
+
+    LLAMA_API bool llama_supports_mmap       (void);
+    LLAMA_API bool llama_supports_mlock      (void);
+    LLAMA_API bool llama_supports_gpu_offload(void);
+
+    LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
+    LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
 
     LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 
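The renamed feature queries are drop-in replacements for the deprecated ones. A minimal usage sketch:

#include "llama.h"
#include <stdio.h>

int main(void) {
    // Before: llama_mmap_supported() / llama_mlock_supported() (now marked DEPRECATED).
    printf("mmap:        %s\n", llama_supports_mmap()        ? "yes" : "no");
    printf("mlock:       %s\n", llama_supports_mlock()       ? "yes" : "no");
    printf("gpu offload: %s\n", llama_supports_gpu_offload() ? "yes" : "no");
    printf("max devices: %zu\n", llama_max_devices());       // now returns size_t
    return 0;
}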