talk-llama : sync llama.cpp

Georgi Gerganov
2024-12-05 14:30:33 +02:00
parent fbe66da0e5
commit f2c680f893
3 changed files with 358 additions and 252 deletions


@@ -185,7 +185,8 @@ extern "C" {
         LLAMA_ROPE_SCALING_TYPE_NONE      = 0,
         LLAMA_ROPE_SCALING_TYPE_LINEAR    = 1,
         LLAMA_ROPE_SCALING_TYPE_YARN      = 2,
-        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
+        LLAMA_ROPE_SCALING_TYPE_LONGROPE  = 3,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
     };
 
     enum llama_pooling_type {
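
The new LLAMA_ROPE_SCALING_TYPE_LONGROPE value can be requested per context. A minimal sketch, assuming a loaded llama_model * model (rope_scaling_type is an existing field of llama_context_params; error handling omitted):

    #include "llama.h"

    // Sketch: request LongRoPE context scaling when creating a context.
    // Assumes `model` is a previously loaded llama_model *.
    struct llama_context_params cparams = llama_context_default_params();
    cparams.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LONGROPE;
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);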
@@ -272,6 +273,9 @@ extern "C" {
     };
 
     struct llama_model_params {
+        // NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
+        ggml_backend_dev_t * devices;
+
         int32_t n_gpu_layers;             // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
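
The new devices field lets a caller restrict offloading to specific backends instead of the default of all available devices. A minimal sketch using the existing ggml backend device API; "model.gguf" is a placeholder path:

    #include "llama.h"
    #include "ggml-backend.h"

    // Sketch: offload to one explicitly chosen backend device.
    // The list must be NULL-terminated; leaving `devices` as NULL keeps
    // the default behavior of using all available devices.
    ggml_backend_dev_t devs[2] = { ggml_backend_dev_get(0), NULL };

    struct llama_model_params mparams = llama_model_default_params();
    mparams.devices      = devs;
    mparams.n_gpu_layers = 99; // offload all layers to the chosen device
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);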
@@ -987,6 +991,9 @@ extern "C" {
                            char * buf,
                            int32_t length);
 
+    // Get list of built-in chat templates
+    LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+
     //
     // Sampling API
     //
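
A usage sketch for the new enumeration function, assuming it follows the common fill-and-return-count pattern (fills at most len entries into output and returns the total number of built-in templates, so a first call with NULL/0 queries the count):

    #include "llama.h"
    #include <stdio.h>
    #include <stdlib.h>

    // Sketch: print the names of all built-in chat templates.
    int32_t n = llama_chat_builtin_templates(NULL, 0);  // assumed: query count only
    const char ** names = (const char **) malloc(n * sizeof(const char *));
    llama_chat_builtin_templates(names, n);             // fill the array
    for (int32_t i = 0; i < n; i++) {
        printf("%s\n", names[i]);
    }
    free(names);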