Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-05-17 14:41:10 +02:00)

talk-llama : sync llama.cpp

ggml-ci

This commit is contained in:
parent a14c89aefa
commit f890560575
@@ -20,6 +20,7 @@ if (WHISPER_SDL2)
 llama-memory.cpp
 llama-mmap.cpp
 llama-model-loader.cpp
+llama-model-saver.cpp
 llama-model.cpp
 llama-quant.cpp
 llama-sampling.cpp
@@ -253,6 +253,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
 std::vector<ggml_backend_buffer_type_t> buft_extra;
 {
 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+if (!cpu_dev) {
+throw std::runtime_error(format("%s: no CPU backend found", __func__));
+}
 auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);

 auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
@@ -291,6 +294,9 @@ static void llama_adapter_lora_init_impl(llama_model & model, const char * path_
 LLAMA_LOG_WARN("%s: lora for '%s' cannot use buft '%s', fallback to CPU\n", __func__, model_tensor->name, ggml_backend_buft_name(buft));

 auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+if (!cpu_dev) {
+throw std::runtime_error(format("%s: no CPU backend found", __func__));
+}
 buft = ggml_backend_dev_buffer_type(cpu_dev);

 break;
@@ -189,7 +189,7 @@ llama_ubatch llama_sbatch::split_seq(size_t n_ubatch) {
 return ubatch;
 }

-void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
+llama_sbatch::llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split, bool logits_all) {
 GGML_ASSERT(batch.n_tokens >= 0);
 this->batch = &batch;
 this->n_embd = n_embd;
@@ -203,6 +203,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
 for (size_t i = 0; i < n_tokens; ++i) {
 ids[i] = i;
 }

 if (simple_split) {
 seq.resize(1);
 llama_sbatch_seq & s = seq[0];
@@ -212,6 +213,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
 s.length = n_tokens;
 return;
 }

 std::sort(ids.begin(), ids.end(),
 [&batch](size_t a, size_t b) {
 int32_t n_seq_a = batch.n_seq_id ? batch.n_seq_id[a] : 1;
@@ -239,6 +241,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
 return n_seq_a > n_seq_b;
 }
 );

 // init seq
 llama_sbatch_seq * last_seq = nullptr;

@@ -262,6 +265,7 @@ void llama_sbatch::from_batch(const llama_batch & batch, size_t n_embd, bool sim
 seq.push_back(new_seq);
 last_seq = &seq.back();
 }

 // keep shared prompts first at the end, then sort by length descending.
 std::sort(seq.begin(), seq.end(),
 [](llama_sbatch_seq & a, llama_sbatch_seq & b) {
@@ -70,7 +70,8 @@ struct llama_sbatch {
 // sequence-wise split
 llama_ubatch split_seq(size_t n_ubatch);

-void from_batch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
+llama_sbatch() = default;
+llama_sbatch(const llama_batch & batch, size_t n_embd, bool simple_split = false, bool logits_all = false);
 };

 // temporary allocate memory for the input batch if needed
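Note for downstream callers syncing against this change: llama_sbatch is now initialized through its constructor instead of the removed from_batch() member. A minimal sketch of the caller-side difference (variable names are illustrative, not taken from the diff):

    // before this sync:
    //   llama_sbatch sbatch;
    //   sbatch.from_batch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);
    // after this sync, construction and initialization happen in one step:
    llama_sbatch sbatch(batch, n_embd, /*simple_split=*/true, /*logits_all=*/false);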
@@ -35,6 +35,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
 { "mistral-v3", LLM_CHAT_TEMPLATE_MISTRAL_V3 },
 { "mistral-v3-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN },
 { "mistral-v7", LLM_CHAT_TEMPLATE_MISTRAL_V7 },
+{ "mistral-v7-tekken", LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN },
 { "phi3", LLM_CHAT_TEMPLATE_PHI_3 },
 { "phi4", LLM_CHAT_TEMPLATE_PHI_4 },
 { "falcon3", LLM_CHAT_TEMPLATE_FALCON_3 },
@@ -202,19 +203,20 @@ int32_t llm_chat_apply_template(
 if (add_ass) {
 ss << "<|im_start|>assistant\n";
 }
-} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7) {
+} else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 || tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN) {
 // Official mistral 'v7' template
 // See: https://huggingface.co/mistralai/Mistral-Large-Instruct-2411#basic-instruct-template-v7
+// https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503#basic-instruct-template-v7-tekken
+const char * trailing_space = tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V7 ? " " : "";
 for (auto message : chat) {
 std::string role(message->role);
 std::string content(message->content);
 if (role == "system") {
-ss << "[SYSTEM_PROMPT] " << content << "[/SYSTEM_PROMPT]";
+ss << "[SYSTEM_PROMPT]" << trailing_space << content << "[/SYSTEM_PROMPT]";
 } else if (role == "user") {
-ss << "[INST] " << content << "[/INST]";
-}
-else {
-ss << " " << content << "</s>";
+ss << "[INST]" << trailing_space << content << "[/INST]";
+} else {
+ss << trailing_space << content << "</s>";
 }
 }
 } else if (tmpl == LLM_CHAT_TEMPLATE_MISTRAL_V1
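For orientation, the only difference the new trailing_space handling introduces is whether a space follows the opening tags; a single system/user/assistant exchange renders roughly as follows (hand-derived from the code above, not captured from a run):

    // mistral-v7        : [SYSTEM_PROMPT] <system>[/SYSTEM_PROMPT][INST] <user>[/INST] <assistant></s>
    // mistral-v7-tekken : [SYSTEM_PROMPT]<system>[/SYSTEM_PROMPT][INST]<user>[/INST]<assistant></s>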
@@ -447,8 +449,16 @@ int32_t llm_chat_apply_template(
 if (add_ass) {
 ss << "<|assistant|>";
 }
-} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4 || tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
+} else if (tmpl == LLM_CHAT_TEMPLATE_CHATGLM_4) {
 ss << "[gMASK]" << "<sop>";
+for (auto message : chat) {
+std::string role(message->role);
+ss << "<|" << role << "|>" << "\n" << message->content;
+}
+if (add_ass) {
+ss << "<|assistant|>\n";
+}
+} else if (tmpl == LLM_CHAT_TEMPLATE_GLMEDGE) {
 for (auto message : chat) {
 std::string role(message->role);
 ss << "<|" << role << "|>" << "\n" << message->content;
@@ -14,6 +14,7 @@ enum llm_chat_template {
 LLM_CHAT_TEMPLATE_MISTRAL_V3,
 LLM_CHAT_TEMPLATE_MISTRAL_V3_TEKKEN,
 LLM_CHAT_TEMPLATE_MISTRAL_V7,
+LLM_CHAT_TEMPLATE_MISTRAL_V7_TEKKEN,
 LLM_CHAT_TEMPLATE_PHI_3,
 LLM_CHAT_TEMPLATE_PHI_4,
 LLM_CHAT_TEMPLATE_FALCON_3,
File diff suppressed because it is too large
@@ -7,6 +7,7 @@
 #include "llama-adapter.h"

 #include "ggml-cpp.h"
+#include "ggml-opt.h"

 #include <map>
 #include <vector>
@@ -28,6 +29,11 @@ struct llama_context {
 void synchronize();

 const llama_model & get_model() const;
+const llama_cparams & get_cparams() const;
+
+ggml_backend_sched_t get_sched() const;
+
+ggml_context * get_ctx_compute() const;

 uint32_t n_ctx() const;
 uint32_t n_ctx_per_seq() const;
@@ -128,6 +134,32 @@ struct llama_context {
 llama_perf_context_data perf_get_data() const;
 void perf_reset();

+//
+// training
+//
+
+void opt_init(struct llama_model * model, struct llama_opt_params lopt_params);
+
+void opt_epoch(
+ggml_opt_dataset_t dataset,
+ggml_opt_result_t result_train,
+ggml_opt_result_t result_eval,
+int64_t idata_split,
+ggml_opt_epoch_callback callback_train,
+ggml_opt_epoch_callback callback_eval);
+
+void opt_epoch_iter(
+ggml_opt_dataset_t dataset,
+ggml_opt_result_t result,
+const std::vector<llama_token> & tokens,
+const std::vector<llama_token> & labels_sparse,
+llama_batch & batch,
+ggml_opt_epoch_callback callback,
+bool train,
+int64_t idata_in_loop,
+int64_t ndata_in_loop,
+int64_t t_loop_start);
+
 private:
 //
 // output
@@ -137,49 +169,30 @@ private:
 // Returns max number of outputs for which space was reserved.
 int32_t output_reserve(int32_t n_outputs);

-// make the outputs have the same order they had in the user-provided batch
-// TODO: maybe remove this
-void output_reorder();

 //
 // graph
 //

+public:
 int32_t graph_max_nodes() const;

 // zero-out inputs and create the ctx_compute for the compute graph
 ggml_cgraph * graph_init();

-llm_graph_result_ptr graph_build(
-ggml_context * ctx,
-ggml_cgraph * gf,
-const llama_ubatch & ubatch,
-llm_graph_type gtype);

 // returns the result of ggml_backend_sched_graph_compute_async execution
 ggml_status graph_compute(
 ggml_cgraph * gf,
 bool batched);

+private:
+llm_graph_result_ptr graph_build(
+ggml_context * ctx,
+ggml_cgraph * gf,
+const llama_ubatch & ubatch,
+llm_graph_type gtype);
+
 llm_graph_cb graph_get_cb() const;

-// used by kv_self_update()
-ggml_tensor * build_rope_shift(
-ggml_context * ctx0,
-ggml_tensor * cur,
-ggml_tensor * shift,
-ggml_tensor * factors,
-float freq_base,
-float freq_scale) const;
-
-llm_graph_result_ptr build_kv_self_shift(
-ggml_context * ctx0,
-ggml_cgraph * gf) const;
-
-llm_graph_result_ptr build_kv_self_defrag(
-ggml_context * ctx0,
-ggml_cgraph * gf) const;

 // TODO: read/write lora adapters and cvec
 size_t state_write_data(llama_io_write_i & io);
 size_t state_read_data (llama_io_read_i & io);
@@ -196,14 +209,10 @@ private:
 llama_cparams cparams;
 llama_adapter_cvec cvec;
 llama_adapter_loras loras;
-llama_sbatch sbatch;

 llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably

-std::unique_ptr<llama_kv_cache_unified> kv_self;
+std::unique_ptr<llama_memory_i> memory;

-// TODO: remove
-bool logits_all = false;

 // decode output (2-dimensional array: [n_outputs][n_vocab])
 size_t logits_size = 0; // capacity (of floats) for logits
@@ -230,6 +239,9 @@ private:

 ggml_context_ptr ctx_compute;

+// training
+ggml_opt_context_t opt_ctx = nullptr;
+
 ggml_threadpool_t threadpool = nullptr;
 ggml_threadpool_t threadpool_batch = nullptr;

@@ -30,6 +30,7 @@ struct llama_cparams {
 bool flash_attn;
 bool no_perf;
 bool warmup;
+bool op_offload;

 enum llama_pooling_type pooling_type;

@@ -284,24 +284,7 @@ void llm_graph_input_s_copy::set_input(const llama_ubatch * ubatch) {

 // assuming copy destinations ALWAYS happen ONLY on the cells between head and head+n
 for (uint32_t i = 0; i < n_kv; ++i) {
-const uint32_t cell_id = i + kv_self->head;
-
-//////////////////////////////////////////////
-// TODO: this should not mutate the KV cache !
-llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
-
-// prevent out-of-bound sources
-if (kv_cell.src < 0 || (uint32_t) kv_cell.src >= kv_self->size) {
-kv_cell.src = cell_id;
-}
-
-data[i] = kv_cell.src;
-
-// TODO: do not mutate the KV cache
-// ensure copy only happens once
-if (kv_cell.src != (int32_t) cell_id) {
-kv_cell.src = cell_id;
-}
+data[i] = kv_self->s_copy(i);
 }
 }
 }
@@ -317,18 +300,7 @@ void llm_graph_input_s_mask::set_input(const llama_ubatch * ubatch) {

 // clear unused states
 for (int i = 0; i < n_kv; ++i) {
-const uint32_t cell_id = i + kv_self->head;
-
-//////////////////////////////////////////////
-// TODO: this should not mutate the KV cache !
-llama_kv_cell & kv_cell = const_cast<class llama_kv_cache_unified *>(kv_self)->cells[i];
-
-data[i] = (float) (kv_cell.src >= 0);
-
-// only clear once
-if (kv_cell.src < 0) {
-kv_cell.src = cell_id;
-}
+data[i] = kv_self->s_mask(i);
 }
 }
 }
@@ -810,7 +782,7 @@ ggml_tensor * llm_graph_context::build_ffn(
 } break;
 }

-if (type_gate == LLM_FFN_PAR) {
+if (gate && type_gate == LLM_FFN_PAR) {
 cur = ggml_mul(ctx0, cur, tmp);
 cb(cur, "ffn_gate_par", il);
 }
@@ -999,6 +971,7 @@ ggml_tensor * llm_graph_context::build_inp_embd(ggml_tensor * tok_embd) const {
 inp->tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ubatch.n_tokens);
 //cb(inp->tokens, "inp_tokens", -1);
 ggml_set_input(inp->tokens);
+res->t_tokens = inp->tokens;

 cur = ggml_get_rows(ctx0, tok_embd, inp->tokens);

@@ -1105,7 +1078,7 @@ ggml_tensor * llm_graph_context::build_inp_cls() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_copy() const {
-const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

 auto inp = std::make_unique<llm_graph_input_s_copy>(kv_self);

@@ -1122,7 +1095,7 @@ ggml_tensor * llm_graph_context::build_inp_s_copy() const {
 }

 ggml_tensor * llm_graph_context::build_inp_s_mask() const {
-const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);

 auto inp = std::make_unique<llm_graph_input_s_mask>(kv_self);

@@ -1255,8 +1228,19 @@ ggml_tensor * llm_graph_context::build_attn_mha(
 ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

 if (v_mla) {
+#if 0
+// v_mla can be applied as a matrix-vector multiplication with broadcasting across dimension 3 == n_tokens.
+// However, the code is optimized for dimensions 0 and 1 being large, so this is ineffient.
 cur = ggml_reshape_4d(ctx0, cur, v_mla->ne[0], 1, n_head, n_tokens);
 cur = ggml_mul_mat(ctx0, v_mla, cur);
+#else
+// It's preferable to do the calculation as a matrix-matrix multiplication with n_tokens in dimension 1.
+// The permutations are noops and only change how the tensor data is interpreted.
+cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+cur = ggml_mul_mat(ctx0, v_mla, cur);
+cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+cur = ggml_cont(ctx0, cur); // Needed because ggml_reshape_2d expects contiguous inputs.
+#endif
 }

 cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
@ -1436,8 +1420,6 @@ ggml_tensor * llm_graph_context::build_attn(
|
|||||||
|
|
||||||
// store to KV cache
|
// store to KV cache
|
||||||
{
|
{
|
||||||
GGML_ASSERT(!kv_self->recurrent);
|
|
||||||
|
|
||||||
const auto kv_head = kv_self->head;
|
const auto kv_head = kv_self->head;
|
||||||
|
|
||||||
GGML_ASSERT(kv_self->size == n_ctx);
|
GGML_ASSERT(kv_self->size == n_ctx);
|
||||||
@ -1587,7 +1569,7 @@ ggml_tensor * llm_graph_context::build_copy_mask_state(
|
|||||||
ggml_tensor * state_mask,
|
ggml_tensor * state_mask,
|
||||||
int32_t n_state,
|
int32_t n_state,
|
||||||
int32_t n_seqs) const {
|
int32_t n_seqs) const {
|
||||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||||
|
|
||||||
const auto n_kv = kv_self->n;
|
const auto n_kv = kv_self->n;
|
||||||
const auto kv_head = kv_self->head;
|
const auto kv_head = kv_self->head;
|
||||||
@ -1619,7 +1601,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
|||||||
ggml_tensor * state_mask,
|
ggml_tensor * state_mask,
|
||||||
const llama_ubatch & ubatch,
|
const llama_ubatch & ubatch,
|
||||||
int il) const {
|
int il) const {
|
||||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||||
|
|
||||||
const auto token_shift_count = hparams.token_shift_count;
|
const auto token_shift_count = hparams.token_shift_count;
|
||||||
|
|
||||||
@ -1640,7 +1622,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_store(
|
|||||||
ggml_tensor * token_shift,
|
ggml_tensor * token_shift,
|
||||||
const llama_ubatch & ubatch,
|
const llama_ubatch & ubatch,
|
||||||
int il) const {
|
int il) const {
|
||||||
const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
|
const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
|
||||||
|
|
||||||
const auto token_shift_count = hparams.token_shift_count;
|
const auto token_shift_count = hparams.token_shift_count;
|
||||||
const auto n_embd = hparams.n_embd;
|
const auto n_embd = hparams.n_embd;
|
||||||
|
@ -19,6 +19,7 @@ struct llama_cparams;
|
|||||||
|
|
||||||
class llama_memory_i;
|
class llama_memory_i;
|
||||||
class llama_kv_cache_unified;
|
class llama_kv_cache_unified;
|
||||||
|
class llama_kv_cache_recurrent;
|
||||||
|
|
||||||
// certain models (typically multi-modal) can produce different types of graphs
|
// certain models (typically multi-modal) can produce different types of graphs
|
||||||
enum llm_graph_type {
|
enum llm_graph_type {
|
||||||
@ -186,26 +187,26 @@ public:
|
|||||||
|
|
||||||
class llm_graph_input_s_copy : public llm_graph_input_i {
|
class llm_graph_input_s_copy : public llm_graph_input_i {
|
||||||
public:
|
public:
|
||||||
llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
|
llm_graph_input_s_copy(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
|
||||||
virtual ~llm_graph_input_s_copy() = default;
|
virtual ~llm_graph_input_s_copy() = default;
|
||||||
|
|
||||||
void set_input(const llama_ubatch * ubatch) override;
|
void set_input(const llama_ubatch * ubatch) override;
|
||||||
|
|
||||||
ggml_tensor * s_copy; // I32 [kv_size]
|
ggml_tensor * s_copy; // I32 [kv_size]
|
||||||
|
|
||||||
const llama_kv_cache_unified * kv_self;
|
const llama_kv_cache_recurrent * kv_self;
|
||||||
};
|
};
|
||||||
|
|
||||||
class llm_graph_input_s_mask : public llm_graph_input_i {
|
class llm_graph_input_s_mask : public llm_graph_input_i {
|
||||||
public:
|
public:
|
||||||
llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
|
llm_graph_input_s_mask(const llama_kv_cache_recurrent * kv_self) : kv_self(kv_self) {}
|
||||||
virtual ~llm_graph_input_s_mask() = default;
|
virtual ~llm_graph_input_s_mask() = default;
|
||||||
|
|
||||||
void set_input(const llama_ubatch * ubatch) override;
|
void set_input(const llama_ubatch * ubatch) override;
|
||||||
|
|
||||||
ggml_tensor * s_mask; // F32 [1, n_kv]
|
ggml_tensor * s_mask; // F32 [1, n_kv]
|
||||||
|
|
||||||
const llama_kv_cache_unified * kv_self;
|
const llama_kv_cache_recurrent * kv_self;
|
||||||
};
|
};
|
||||||
|
|
||||||
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
class llm_graph_input_cross_embd : public llm_graph_input_i {
|
||||||
@ -297,6 +298,7 @@ class llm_graph_result_i {
|
|||||||
public:
|
public:
|
||||||
virtual ~llm_graph_result_i() = default;
|
virtual ~llm_graph_result_i() = default;
|
||||||
|
|
||||||
|
virtual ggml_tensor * get_tokens() = 0;
|
||||||
virtual ggml_tensor * get_logits() = 0;
|
virtual ggml_tensor * get_logits() = 0;
|
||||||
virtual ggml_tensor * get_embd() = 0;
|
virtual ggml_tensor * get_embd() = 0;
|
||||||
virtual ggml_tensor * get_embd_pooled() = 0;
|
virtual ggml_tensor * get_embd_pooled() = 0;
|
||||||
@ -311,6 +313,7 @@ class llm_graph_result : public llm_graph_result_i {
|
|||||||
public:
|
public:
|
||||||
virtual ~llm_graph_result() = default;
|
virtual ~llm_graph_result() = default;
|
||||||
|
|
||||||
|
ggml_tensor * get_tokens() override { return t_tokens; }
|
||||||
ggml_tensor * get_logits() override { return t_logits; }
|
ggml_tensor * get_logits() override { return t_logits; }
|
||||||
ggml_tensor * get_embd() override { return t_embd; }
|
ggml_tensor * get_embd() override { return t_embd; }
|
||||||
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
|
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
|
||||||
@ -327,6 +330,7 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
// important graph nodes
|
// important graph nodes
|
||||||
|
ggml_tensor * t_tokens = nullptr;
|
||||||
ggml_tensor * t_logits = nullptr;
|
ggml_tensor * t_logits = nullptr;
|
||||||
ggml_tensor * t_embd = nullptr;
|
ggml_tensor * t_embd = nullptr;
|
||||||
ggml_tensor * t_embd_pooled = nullptr;
|
ggml_tensor * t_embd_pooled = nullptr;
|
||||||
@ -350,8 +354,8 @@ struct llm_graph_params {
|
|||||||
const llama_cparams & cparams;
|
const llama_cparams & cparams;
|
||||||
const llama_ubatch & ubatch;
|
const llama_ubatch & ubatch;
|
||||||
|
|
||||||
ggml_backend_sched * sched;
|
ggml_backend_sched_t sched;
|
||||||
ggml_backend * backend_cpu;
|
ggml_backend_t backend_cpu;
|
||||||
|
|
||||||
const llama_adapter_cvec * cvec;
|
const llama_adapter_cvec * cvec;
|
||||||
const llama_adapter_loras * loras;
|
const llama_adapter_loras * loras;
|
||||||
@ -402,9 +406,9 @@ struct llm_graph_context {
|
|||||||
|
|
||||||
ggml_context * ctx0 = nullptr;
|
ggml_context * ctx0 = nullptr;
|
||||||
|
|
||||||
ggml_backend_sched * sched;
|
ggml_backend_sched_t sched;
|
||||||
|
|
||||||
ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
|
ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
|
||||||
|
|
||||||
const llama_adapter_cvec * cvec;
|
const llama_adapter_cvec * cvec;
|
||||||
const llama_adapter_loras * loras;
|
const llama_adapter_loras * loras;
|
||||||
File diff suppressed because it is too large
@@ -2,32 +2,72 @@

 #include "llama.h"
 #include "llama-io.h"
+#include "llama-graph.h"
 #include "llama-memory.h"

 #include "ggml-cpp.h"

-#include <functional>
 #include <set>
 #include <vector>

 struct llama_cparams;
 struct llama_hparams;
 struct llama_ubatch;
+struct llama_sbatch;
+struct llama_model;
+struct llama_context;

 struct llama_kv_cache : public llama_memory_i {
-using llama_memory_i::llama_memory_i;
+virtual ~llama_kv_cache() = default;

-virtual void restore() = 0; // call if batch processing fails - restores the cache state
-virtual void commit() = 0; // call after successful batch processing - clears any pending state
+// call if batch processing fails - restores the cache state
+virtual void restore() = 0;

+// call after successful batch processing - clears any pending state
+virtual void commit() = 0;
+
+// process any pending defrag/shift/etc. operations
+// optionally call once before processing a new batch
+virtual bool update(llama_context & lctx) = 0;
+
+// schedule a defrag if the fragmentation threshold is exceeded. otherwise, do nothing
+virtual void defrag_sched(float thold) = 0;
+
+// simulate full cache, used for allocating worst-case compute buffers
+virtual void set_full() = 0;
+
+//
+// batch processing
+//
+
+virtual llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) = 0;
+
+// different KV caches require different batch splitting strategies
+virtual llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const = 0;
+
+// find an empty slot of size "n_tokens" in the cache
+virtual bool find_slot(const llama_ubatch & batch) = 0;
+
+// getters
 virtual int32_t get_n_tokens() const = 0;
 virtual int32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
+virtual llama_pos get_pos_max() const = 0;
 virtual bool get_can_shift() const = 0;

 bool get_can_edit() const override { return get_can_shift(); }

+//
+// state write/read
+//
+
+virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
+virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;
 };

+//
+// llama_kv_cache_guard
+//
+
 struct llama_kv_cache_guard {
 llama_kv_cache_guard(llama_kv_cache * kv) : kv(kv) {}

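The comments above define a commit/restore protocol around batch processing. A rough caller-side sketch of how the interface is meant to be driven (hypothetical driver code, assuming llama_sbatch exposes the remaining token count as n_tokens; this is not code from the commit):

    // split the incoming batch, reserve cache slots, then either commit or roll back
    llama_sbatch sbatch = kv->sbatch_init(batch, /*logits_all=*/false);
    bool ok = true;
    while (sbatch.n_tokens > 0) {
        llama_ubatch ubatch = kv->ubatch_next(sbatch, n_ubatch, /*embd_pooled=*/false);
        if (!kv->find_slot(ubatch)) { ok = false; break; } // no free slot of the required size
        // ... build and compute the graph for this ubatch ...
    }
    if (ok) {
        kv->commit();  // clear pending state after successful processing
    } else {
        kv->restore(); // roll back the not-yet-committed cell updates
    }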
@@ -43,10 +83,190 @@ private:
 llama_kv_cache * kv;
 };

-struct llama_kv_cell {
+//
+// llama_kv_cache_unified
+//
+
+// TODO: add notion of max sequences
+class llama_kv_cache_unified : public llama_kv_cache {
+public:
+struct kv_cell {
 llama_pos pos = -1;
 llama_pos delta = 0;
-int32_t src = -1; // used by recurrent state models to copy states
+std::set<llama_seq_id> seq_id;
+
+bool has_seq_id(const llama_seq_id & id) const {
+return seq_id.find(id) != seq_id.end();
+}
+
+bool is_empty() const {
+return seq_id.empty();
+}
+
+bool is_same_seq(const kv_cell & other) const {
+return seq_id == other.seq_id;
+}
+};
+
+static uint32_t get_padding(const llama_cparams & cparams);
+
+llama_kv_cache_unified(
+const llama_model & model,
+ggml_type type_k,
+ggml_type type_v,
+bool v_trans,
+bool offload,
+uint32_t kv_size,
+uint32_t padding);
+
+~llama_kv_cache_unified() = default;
+
+//
+// llama_memory_i
+//
+
+void clear() override;
+
+bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
+void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
+void seq_keep(llama_seq_id seq_id) override;
+void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) override;
+void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) override;
+
+llama_pos seq_pos_max(llama_seq_id seq_id) const override;
+
+//
+// llama_kv_cache
+//
+
+void restore() override;
+void commit() override;
+
+bool update(llama_context & ctx) override;
+
+void defrag_sched(float thold) override;
+
+void set_full() override;
+
+llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+
+llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+
+// updates the cache head
+// Note: On success, it's important that cache.head points
+// to the first cell of the slot.
+bool find_slot(const llama_ubatch & batch) override;
+
+int32_t get_n_tokens() const override;
+int32_t get_used_cells() const override;
+
+// TODO: better data structures to reduce the cost of this operation
+llama_pos get_pos_max() const override;
+
+bool get_can_shift() const override;
+
+// state write/load
+
+void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;
+
+// Note: The value of head isn't only used to optimize searching
+// for a free KV slot. llama_decode_impl also uses it, so it
+// cannot be freely changed after a slot has been allocated.
+uint32_t head = 0;
+uint32_t size = 0;
+uint32_t used = 0; // used cells (i.e. at least one seq_id)
+
+// computed before each graph build
+uint32_t n = 0;
+
+std::vector<kv_cell> cells;
+
+std::vector<ggml_tensor *> k_l; // per layer
+std::vector<ggml_tensor *> v_l;
+
+private:
+const llama_model & model;
+const llama_hparams & hparams;
+
+bool has_shift = false;
+bool do_defrag = false;
+
+bool v_trans = true; // the value tensor is transposed
+bool can_shift = false;
+
+// required padding
+uint32_t padding = 1;
+
+ggml_type type_k = GGML_TYPE_F16;
+ggml_type type_v = GGML_TYPE_F16;
+
+std::vector<ggml_context_ptr> ctxs;
+std::vector<ggml_backend_buffer_ptr> bufs;
+
+// defrag
+struct {
+std::vector<uint32_t> ids;
+} defrag_info;
+
+// return true if cells have been moved
+bool defrag_prepare(int32_t n_max_nodes);
+
+// commit/restore cache
+struct slot_range {
+uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+uint32_t c1 = 0;
+};
+
+// pending cell updates that are not yet committed
+struct {
+std::vector<slot_range> ranges;
+} pending;
+
+// find how many cells are currently in use
+uint32_t cell_max() const;
+
+size_t total_size() const;
+
+size_t size_k_bytes() const;
+size_t size_v_bytes() const;
+
+ggml_tensor * build_rope_shift(
+const llama_cparams & cparams,
+ggml_context * ctx,
+ggml_tensor * cur,
+ggml_tensor * shift,
+ggml_tensor * factors,
+float freq_base,
+float freq_scale) const;
+
+llm_graph_result_ptr build_graph_shift(
+const llama_cparams & cparams,
+ggml_context * ctx,
+ggml_cgraph * gf) const;
+
+llm_graph_result_ptr build_graph_defrag(
+const llama_cparams & cparams,
+ggml_context * ctx,
+ggml_cgraph * gf) const;
+
+void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
+void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
+
+bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
+bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
+};
+
+//
+// llama_kv_cache_recurrent
+//
+
+class llama_kv_cache_recurrent : public llama_kv_cache {
+public:
+struct kv_cell {
+llama_pos pos = -1;
+int32_t src = -1; // used to copy states
 int32_t tail = -1;

 std::set<llama_seq_id> seq_id;
@@ -59,49 +279,25 @@ struct llama_kv_cell {
 return seq_id.empty();
 }

-bool is_same_seq(const llama_kv_cell & other) const {
+bool is_same_seq(const kv_cell & other) const {
 return seq_id == other.seq_id;
 }
-};
-
-// ring-buffer of cached KV data
-// TODO: pimpl
-// TODO: add notion of max sequences
-class llama_kv_cache_unified : public llama_kv_cache {
-public:
-// can be used to query data from the model if needed
-struct callbacks {
-std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
 };

-llama_kv_cache_unified(
-const llama_hparams & hparams,
-callbacks cbs);
-
-virtual ~llama_kv_cache_unified() = default;
-
-// TODO: become constructor
-bool init(
-const llama_model & model, // TODO: do not reference the model
-const llama_cparams & cparams,
+llama_kv_cache_recurrent(
+const llama_model & model,
 ggml_type type_k,
 ggml_type type_v,
-uint32_t kv_size,
-bool offload);
+bool offload,
+uint32_t kv_size);

-int32_t get_n_tokens() const override;
-int32_t get_used_cells() const override;
+~llama_kv_cache_recurrent() = default;

-size_t total_size() const;
-// TODO: better data structures to reduce the cost of this operation
-llama_pos pos_max() const;
+//
+// llama_memory_i
+//

 void clear() override;
-void defrag() override;
-
-virtual void restore() override;
-virtual void commit() override;

 bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) override;
 void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
@@ -111,63 +307,41 @@ public:

 llama_pos seq_pos_max(llama_seq_id seq_id) const override;

+//
+// llama_kv_cache
+//
+
+void restore() override;
+void commit() override;
+
+bool update(llama_context & lctx) override;
+
+void defrag_sched(float thold) override;
+
+void set_full() override;
+
+llama_sbatch sbatch_init(const llama_batch & batch, bool logits_all) override;
+
+llama_ubatch ubatch_next(llama_sbatch & sbatch, uint32_t n_ubatch, bool embd_pooled) const override;
+
+bool find_slot(const llama_ubatch & batch) override;
+
+int32_t get_n_tokens() const override;
+int32_t get_used_cells() const override;
+
+// TODO: better data structures to reduce the cost of this operation
+llama_pos get_pos_max() const override;
+
 bool get_can_shift() const override;

-// find an empty slot of size "n_tokens" in the cache
-// updates the cache head
-// Note: On success, it's important that cache.head points
-// to the first cell of the slot.
-bool find_slot(const llama_ubatch & batch);
-
-// TODO: maybe not needed
-uint32_t get_padding(const llama_cparams & cparams) const;
-
-// find how many cells are currently in use
-uint32_t cell_max() const;
-
-size_t size_k_bytes() const;
-size_t size_v_bytes() const;
-
-// defrag
-
-struct {
-std::vector<uint32_t> ids;
-} defrag_info;
-
-// return true if cells have been moved
-bool defrag_prepare(int32_t n_max_nodes);
-
-// commit/restore cache
-
-struct slot_range {
-uint32_t c0 = 0; // note: these are cell indices, not sequence positions
-uint32_t c1 = 0;
-};
-
-// pending cell updates that are not yet committed
-struct {
-std::vector<slot_range> ranges;
-} pending;
+// TODO: temporary methods - they are not really const as they do const_cast<>, fix this
+int32_t s_copy(int i) const;
+float s_mask(int i) const;

 // state write/load

-void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
-void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1);
+void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const override;
+void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) override;

-// members
-
-const llama_hparams & hparams;
-
-callbacks cbs;
-
-bool has_shift = false;
-bool do_defrag = false;
-
-// TODO: remove this and implement llama_kv_cache_recurrent instead
-bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
-
-bool v_trans = true; // the value tensor is transposed
-bool can_shift = false;
-
 // Note: The value of head isn't only used to optimize searching
 // for a free KV slot. llama_decode_impl also uses it, so it
@@ -179,18 +353,41 @@ public:
 // computed before each graph build
 uint32_t n = 0;

-std::vector<llama_kv_cell> cells;
+std::vector<kv_cell> cells;

 std::vector<ggml_tensor *> k_l; // per layer
 std::vector<ggml_tensor *> v_l;

 private:
+//const llama_model & model;
+const llama_hparams & hparams;
+
+// commit/restore cache
+// TODO: rework for recurrent cache
+struct slot_range {
+uint32_t c0 = 0; // note: these are cell indices, not sequence positions
+uint32_t c1 = 0;
+};
+
+// pending cell updates that are not yet committed
+struct {
+std::vector<slot_range> ranges;
+} pending;
+
 ggml_type type_k = GGML_TYPE_F16;
 ggml_type type_v = GGML_TYPE_F16;

 std::vector<ggml_context_ptr> ctxs;
 std::vector<ggml_backend_buffer_ptr> bufs;

+// find how many cells are currently in use
+uint32_t cell_max() const;
+
+size_t total_size() const;
+
+size_t size_k_bytes() const;
+size_t size_v_bytes() const;
+
 void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
 void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;

@@ -198,11 +395,6 @@ private:
 bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
 };

-// TODO: temporary reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
-//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
-//public:
-// using llama_kv_cache_unified::llama_kv_cache_unified;
-//};

 //
 // kv cache view
@@ -2,12 +2,22 @@

 #include "llama.h"

+struct llama_memory_params {
+// kv cache
+ggml_type type_k;
+ggml_type type_v;
+
+// parameters for other types of memory
+// ...
+};
+
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 class llama_memory_i {
 public:
+virtual ~llama_memory_i() = default;
+
 virtual void clear() = 0;
-virtual void defrag() = 0;

 virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;
 virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
@@ -301,12 +301,12 @@ namespace GGUFMeta {
 GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

 switch (arr_info.gt) {
-case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-case GGUF_TYPE_INT32: GGML_ASSERT(
-(std::is_same<T, int32_t>::value) ||
+case GGUF_TYPE_UINT32:
+case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
 (std::is_same<T, uint32_t>::value)); break;
+case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
 default:
-throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
 }

 result.resize(arr_info.length);
@@ -330,12 +330,12 @@ namespace GGUFMeta {
 GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(meta.get(), kid);

 switch (arr_info.gt) {
-case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
-case GGUF_TYPE_INT32: GGML_ASSERT(
-(std::is_same<T, int32_t>::value) ||
+case GGUF_TYPE_UINT32:
+case GGUF_TYPE_INT32: GGML_ASSERT((std::is_same<T, int32_t>::value) ||
 (std::is_same<T, uint32_t>::value)); break;
+case GGUF_TYPE_FLOAT32: GGML_ASSERT((std::is_same<T, float>::value)); break;
 default:
-throw std::runtime_error(format("%s is not a float32, int32 array", key.c_str()));
+throw std::runtime_error(format("%s is not a float32/uint32/int32 array", key.c_str()));
 }

 if (arr_info.length > N_MAX) {
@@ -823,6 +823,10 @@ void llama_model_loader::init_mappings(bool prefetch, llama_mlocks * mlock_mmaps
 mmaps_used.reserve(files.size());
 for (const auto & file : files) {
 auto * reg = ggml_backend_dev_backend_reg(ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU));
+if (!reg) {
+throw std::runtime_error(format("%s: no CPU backend found", __func__));
+}
+
 auto * is_numa_fn = (decltype(ggml_is_numa) *) ggml_backend_reg_get_proc_address(reg, "ggml_backend_cpu_is_numa");
 std::unique_ptr<llama_mmap> mapping = std::make_unique<llama_mmap>(file.get(), prefetch ? -1 : 0, is_numa_fn());
 mmaps_used.emplace_back(mapping->size(), 0);
examples/talk-llama/llama-model-saver.cpp (new file, 281 lines)
@@ -0,0 +1,281 @@
+#include "llama-model-saver.h"
+
+#include "gguf.h"
+
+#include "llama.h"
+#include "llama-hparams.h"
+#include "llama-model.h"
+#include "llama-vocab.h"
+
+#include <string>
+
+llama_model_saver::llama_model_saver(const struct llama_model & model) : model(model), llm_kv(model.arch) {
+gguf_ctx = gguf_init_empty();
+}
+
+llama_model_saver::~llama_model_saver() {
+gguf_free(gguf_ctx);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const uint32_t value) {
+gguf_set_val_u32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const int32_t value) {
+gguf_set_val_i32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const float value) {
+gguf_set_val_f32(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const bool value) {
+gguf_set_val_bool(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const char * value) {
+gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), value);
+}
+
+[[noreturn]]
+void llama_model_saver::add_kv(const enum llm_kv key, const char value) {
+GGML_UNUSED(key);
+GGML_UNUSED(value);
+GGML_ABORT("fatal error"); // this should never be called, only needed to make the template below compile
+}
+
+template <typename Container>
+void llama_model_saver::add_kv(const enum llm_kv key, const Container & value, const bool per_layer) {
+const size_t n_values = per_layer ? size_t(model.hparams.n_layer) : value.size();
+GGML_ASSERT(n_values <= value.size());
+
+if (n_values == 0) {
+return;
+}
+
+if (per_layer) {
+bool all_values_the_same = true;
+for (size_t i = 1; i < n_values; ++i) {
+if (value[i] != value[0]) {
+all_values_the_same = false;
+break;
+}
+}
+if (all_values_the_same) {
+add_kv(key, value[0]);
+return;
+}
+}
+
+if (std::is_same<typename Container::value_type, uint8_t>::value) {
+gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT8, value.data(), n_values);
+} else if (std::is_same<typename Container::value_type, int8_t>::value) {
+gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT8, value.data(), n_values);
+} else if (std::is_same<typename Container::value_type, uint32_t>::value) {
+gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_UINT32, value.data(), n_values);
+} else if (std::is_same<typename Container::value_type, int32_t>::value) {
+gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_INT32, value.data(), n_values);
+} else if (std::is_same<typename Container::value_type, float>::value) {
+gguf_set_arr_data(gguf_ctx, llm_kv(key).c_str(), GGUF_TYPE_FLOAT32, value.data(), n_values);
+} else if (std::is_same<Container, std::string>::value) {
+gguf_set_val_str(gguf_ctx, llm_kv(key).c_str(), reinterpret_cast<const char *>(value.data()));
+} else {
+GGML_ABORT("fatal error");
+}
+}
+
+void llama_model_saver::add_kv(const enum llm_kv key, const std::vector<std::string> & value) {
+std::vector<const char *> tmp(value.size());
+for (size_t i = 0; i < value.size(); ++i) {
+tmp[i] = value[i].c_str();
+}
+gguf_set_arr_str(gguf_ctx, llm_kv(key).c_str(), tmp.data(), tmp.size());
+}
+
+void llama_model_saver::add_tensor(const struct ggml_tensor * tensor) {
+if (!tensor) {
+return;
+}
+if (gguf_find_tensor(gguf_ctx, tensor->name) >= 0) {
+GGML_ASSERT(std::string(tensor->name) == "rope_freqs.weight"); // FIXME
+return;
+}
+gguf_add_tensor(gguf_ctx, tensor);
+}
+
+void llama_model_saver::add_kv_from_model() {
+const llama_hparams & hparams = model.hparams;
+const llama_vocab & vocab = model.vocab;
+
+const int32_t n_vocab = vocab.n_tokens();
+std::vector<std::string> tokens(n_vocab);
+std::vector<float> scores(n_vocab);
+std::vector<int32_t> token_types(n_vocab);
+
+for (int32_t id = 0; id < n_vocab; ++id) {
+const llama_vocab::token_data & token_data = vocab.get_token_data(id);
+
+tokens[id] = token_data.text;
+scores[id] = token_data.score;
+
+switch(token_data.attr) {
+case LLAMA_TOKEN_ATTR_UNKNOWN: token_types[id] = LLAMA_TOKEN_TYPE_UNKNOWN; break;
+case LLAMA_TOKEN_ATTR_UNUSED: token_types[id] = LLAMA_TOKEN_TYPE_UNUSED; break;
+case LLAMA_TOKEN_ATTR_NORMAL: token_types[id] = LLAMA_TOKEN_TYPE_NORMAL; break;
+case LLAMA_TOKEN_ATTR_CONTROL: token_types[id] = LLAMA_TOKEN_TYPE_CONTROL; break;
+case LLAMA_TOKEN_ATTR_USER_DEFINED: token_types[id] = LLAMA_TOKEN_TYPE_USER_DEFINED; break;
+case LLAMA_TOKEN_ATTR_BYTE: token_types[id] = LLAMA_TOKEN_TYPE_BYTE; break;
+case LLAMA_TOKEN_ATTR_UNDEFINED:
+default: token_types[id] = LLAMA_TOKEN_TYPE_UNDEFINED; break;
+}
+}
+
+// add_kv(LLM_KV_GENERAL_TYPE, ???);
+add_kv(LLM_KV_GENERAL_ARCHITECTURE, model.arch_name());
+// add_kv(LLM_KV_GENERAL_QUANTIZATION_VERSION, ???);
+// add_kv(LLM_KV_GENERAL_ALIGNMENT, ???);
+add_kv(LLM_KV_GENERAL_NAME, model.name);
+// add_kv(LLM_KV_GENERAL_AUTHOR, ???);
+// add_kv(LLM_KV_GENERAL_VERSION, ???);
+// add_kv(LLM_KV_GENERAL_URL, ???);
+// add_kv(LLM_KV_GENERAL_DESCRIPTION, ???);
+// add_kv(LLM_KV_GENERAL_LICENSE, ???);
+// add_kv(LLM_KV_GENERAL_SOURCE_URL, ???);
+// add_kv(LLM_KV_GENERAL_SOURCE_HF_REPO, ???);
+
+add_kv(LLM_KV_VOCAB_SIZE, vocab.n_tokens());
+add_kv(LLM_KV_CONTEXT_LENGTH, hparams.n_ctx_train);
+add_kv(LLM_KV_EMBEDDING_LENGTH, hparams.n_embd);
+add_kv(LLM_KV_BLOCK_COUNT, hparams.n_layer);
+add_kv(LLM_KV_LEADING_DENSE_BLOCK_COUNT, hparams.n_layer_dense_lead);
+add_kv(LLM_KV_FEED_FORWARD_LENGTH, hparams.n_ff_arr, true);
+add_kv(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+add_kv(LLM_KV_EXPERT_SHARED_FEED_FORWARD_LENGTH, hparams.n_ff_exp);
+add_kv(LLM_KV_USE_PARALLEL_RESIDUAL, hparams.use_par_res);
+// add_kv(LLM_KV_TENSOR_DATA_LAYOUT, ???);
+add_kv(LLM_KV_EXPERT_COUNT, hparams.n_expert);
+add_kv(LLM_KV_EXPERT_USED_COUNT, hparams.n_expert_used);
+add_kv(LLM_KV_EXPERT_SHARED_COUNT, hparams.n_expert_shared);
+add_kv(LLM_KV_EXPERT_WEIGHTS_SCALE, hparams.expert_weights_scale);
+add_kv(LLM_KV_POOLING_TYPE, uint32_t(hparams.pooling_type));
+add_kv(LLM_KV_LOGIT_SCALE, hparams.f_logit_scale);
+add_kv(LLM_KV_DECODER_START_TOKEN_ID, hparams.dec_start_token_id);
+add_kv(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping);
+add_kv(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping);
+add_kv(LLM_KV_SWIN_NORM, hparams.swin_norm);
+add_kv(LLM_KV_RESCALE_EVERY_N_LAYERS, hparams.rescale_every_n_layers);
+add_kv(LLM_KV_TIME_MIX_EXTRA_DIM, hparams.time_mix_extra_dim);
+add_kv(LLM_KV_TIME_DECAY_EXTRA_DIM, hparams.time_decay_extra_dim);
+add_kv(LLM_KV_RESIDUAL_SCALE, hparams.f_residual_scale);
+add_kv(LLM_KV_EMBEDDING_SCALE, hparams.f_embedding_scale);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head_arr, true);
|
||||||
|
add_kv(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv_arr, true);
|
||||||
|
add_kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
|
||||||
|
add_kv(LLM_KV_ATTENTION_CLAMP_KQV, hparams.f_clamp_kqv);
|
||||||
|
add_kv(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k);
|
||||||
|
add_kv(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v);
|
||||||
|
add_kv(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
||||||
|
add_kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||||
|
add_kv(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
||||||
|
add_kv(LLM_KV_ATTENTION_Q_LORA_RANK, hparams.n_lora_q);
|
||||||
|
add_kv(LLM_KV_ATTENTION_KV_LORA_RANK, hparams.n_lora_kv);
|
||||||
|
add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, hparams.n_rel_attn_bkts);
|
||||||
|
add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa);
|
||||||
|
add_kv(LLM_KV_ATTENTION_SCALE, hparams.f_attention_scale);
|
||||||
|
|
||||||
|
const float rope_scaling_factor = hparams.rope_freq_scale_train == 1.0f ? 0.0f : 1.0f/hparams.rope_freq_scale_train;
|
||||||
|
|
||||||
|
add_kv(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot);
|
||||||
|
add_kv(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train);
|
||||||
|
// add_kv(LLM_KV_ROPE_SCALE_LINEAR, rope_scaling_factor); // old name
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_TYPE, llama_rope_scaling_type_name(hparams.rope_scaling_type_train));
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_FACTOR, rope_scaling_factor);
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_ATTN_FACTOR, hparams.rope_attn_factor);
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_ctx_orig_yarn);
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_FINETUNED, hparams.rope_finetuned);
|
||||||
|
add_kv(LLM_KV_ROPE_SCALING_YARN_LOG_MUL, hparams.rope_yarn_log_mul);
|
||||||
|
|
||||||
|
// TODO: implement split file support
|
||||||
|
// add_kv(LLM_KV_SPLIT_NO, ???);
|
||||||
|
// add_kv(LLM_KV_SPLIT_COUNT, ???);
|
||||||
|
// add_kv(LLM_KV_SPLIT_TENSORS_COUNT, ???);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner);
|
||||||
|
add_kv(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
|
||||||
|
add_kv(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state);
|
||||||
|
add_kv(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank);
|
||||||
|
add_kv(LLM_KV_SSM_DT_B_C_RMS, hparams.ssm_dt_b_c_rms);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_WKV_HEAD_SIZE, hparams.wkv_head_size);
|
||||||
|
|
||||||
|
add_kv(LLM_KV_TOKENIZER_MODEL, vocab.get_tokenizer_model());
|
||||||
|
add_kv(LLM_KV_TOKENIZER_PRE, vocab.get_tokenizer_pre());
|
||||||
|
add_kv(LLM_KV_TOKENIZER_LIST, tokens);
|
||||||
|
add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE, token_types);
|
||||||
|
add_kv(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, vocab.n_token_types());
|
||||||
|
add_kv(LLM_KV_TOKENIZER_SCORES, scores);
|
||||||
|
add_kv(LLM_KV_TOKENIZER_MERGES, vocab.get_bpe_merges());
|
||||||
|
// FIXME llama_token is type i32 but when reading in a GGUF file u32 is expected, not an issue for writing though
|
||||||
|
add_kv(LLM_KV_TOKENIZER_BOS_ID, uint32_t(vocab.token_bos()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_EOS_ID, uint32_t(vocab.token_eos()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_EOT_ID, uint32_t(vocab.token_eot()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_EOM_ID, uint32_t(vocab.token_eom()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_UNK_ID, uint32_t(vocab.token_unk()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_SEP_ID, uint32_t(vocab.token_sep()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_PAD_ID, uint32_t(vocab.token_pad()));
|
||||||
|
// add_kv(LLM_KV_TOKENIZER_CLS_ID, uint32_t(vocab.token_bos())); // deprecated
|
||||||
|
// add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
|
||||||
|
add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
|
||||||
|
add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
|
||||||
|
add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
|
||||||
|
add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
|
||||||
|
add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
|
||||||
|
// add_kv(LLM_KV_TOKENIZER_HF_JSON, ???);
|
||||||
|
// add_kv(LLM_KV_TOKENIZER_RWKV, ???);
|
||||||
|
add_kv(LLM_KV_TOKENIZER_FIM_PRE_ID, uint32_t(vocab.token_fim_pre()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_FIM_SUF_ID, uint32_t(vocab.token_fim_suf()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_FIM_MID_ID, uint32_t(vocab.token_fim_mid()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_FIM_PAD_ID, uint32_t(vocab.token_fim_pad()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_FIM_REP_ID, uint32_t(vocab.token_fim_rep()));
|
||||||
|
add_kv(LLM_KV_TOKENIZER_FIM_SEP_ID, uint32_t(vocab.token_fim_sep()));
|
||||||
|
|
||||||
|
// TODO: implement LoRA support
|
||||||
|
// add_kv(LLM_KV_ADAPTER_TYPE, ???);
|
||||||
|
// add_kv(LLM_KV_ADAPTER_LORA_ALPHA, ???);
|
||||||
|
|
||||||
|
// deprecated
|
||||||
|
// add_kv(LLM_KV_TOKENIZER_PREFIX_ID, ???);
|
||||||
|
// add_kv(LLM_KV_TOKENIZER_SUFFIX_ID, ???);
|
||||||
|
// add_kv(LLM_KV_TOKENIZER_MIDDLE_ID, ???);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_model_saver::add_tensors_from_model() {
|
||||||
|
if (std::string(model.output->name) != std::string(model.tok_embd->name)) {
|
||||||
|
add_tensor(model.tok_embd); // some models use the same tensor for tok_embd and output
|
||||||
|
}
|
||||||
|
add_tensor(model.type_embd);
|
||||||
|
add_tensor(model.pos_embd);
|
||||||
|
add_tensor(model.tok_norm);
|
||||||
|
add_tensor(model.tok_norm_b);
|
||||||
|
add_tensor(model.output_norm);
|
||||||
|
add_tensor(model.output_norm_b);
|
||||||
|
add_tensor(model.output);
|
||||||
|
add_tensor(model.output_b);
|
||||||
|
add_tensor(model.output_norm_enc);
|
||||||
|
add_tensor(model.cls);
|
||||||
|
add_tensor(model.cls_b);
|
||||||
|
add_tensor(model.cls_out);
|
||||||
|
add_tensor(model.cls_out_b);
|
||||||
|
|
||||||
|
for (const struct llama_layer & layer : model.layers) {
|
||||||
|
for (size_t i = 0; i < sizeof(layer)/sizeof(struct ggml_tensor *); ++i) {
|
||||||
|
add_tensor(reinterpret_cast<const struct ggml_tensor * const *>(&layer)[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void llama_model_saver::save(const std::string & path_model) {
|
||||||
|
gguf_write_to_file(gguf_ctx, path_model.c_str(), false);
|
||||||
|
}
|
||||||
|
|
examples/talk-llama/llama-model-saver.h (new file, 37 lines)
@@ -0,0 +1,37 @@
#pragma once

#include "llama.h"
#include "llama-arch.h"

#include <vector>

struct llama_model_saver {
    struct gguf_context * gguf_ctx = nullptr;
    const struct llama_model & model;
    const struct LLM_KV llm_kv;

    llama_model_saver(const struct llama_model & model);
    ~llama_model_saver();

    void add_kv(enum llm_kv key, uint32_t value);
    void add_kv(enum llm_kv key, int32_t value);
    void add_kv(enum llm_kv key, float value);
    void add_kv(enum llm_kv key, bool value);
    void add_kv(enum llm_kv key, const char * value);

    [[noreturn]]
    void add_kv(enum llm_kv key, char value); // needed to make the template below compile

    template <typename Container>
    void add_kv(enum llm_kv key, const Container & value, bool per_layer = false);

    void add_kv(enum llm_kv key, const std::vector<std::string> & value);

    void add_tensor(const struct ggml_tensor * tensor);

    void add_kv_from_model();

    void add_tensors_from_model();

    void save(const std::string & path_model);
};
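A minimal usage sketch of the new saver class (not part of the commit): it assumes a fully loaded llama_model reference is already available, and the output filename is made up. This mirrors what the public llama_model_save_to_file() wrapper added later in this commit does.

// Sketch: write a loaded model back out as a GGUF file using llama_model_saver.
#include "llama-model-saver.h"

static void save_model_copy(const llama_model & model) {
    llama_model_saver ms(model);  // sets up the target gguf_context
    ms.add_kv_from_model();       // architecture, hparams and tokenizer metadata
    ms.add_tensors_from_model();  // embeddings, output head and per-layer tensors
    ms.save("model-copy.gguf");   // hypothetical output path
}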
examples/talk-llama/llama-model.cpp
@@ -80,6 +80,7 @@ const char * llm_type_name(llm_type type) {
         case LLM_TYPE_236B: return "236B";
         case LLM_TYPE_290B: return "290B";
         case LLM_TYPE_314B: return "314B";
+        case LLM_TYPE_405B: return "405B";
         case LLM_TYPE_671B: return "671B";
         case LLM_TYPE_SMALL: return "0.1B";
         case LLM_TYPE_MEDIUM: return "0.4B";
@@ -116,6 +117,10 @@ static const std::map<llama_rope_scaling_type, const char *> LLAMA_ROPE_SCALING_
     { LLAMA_ROPE_SCALING_TYPE_LONGROPE, "longrope" },
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type) {
+    return LLAMA_ROPE_SCALING_TYPES.at(rope_scaling_type);
+}
+
 static llama_rope_scaling_type llama_rope_scaling_type_from_string(const std::string & name) {
     for (const auto & kv : LLAMA_ROPE_SCALING_TYPES) {
         if (kv.second == name) {
@@ -298,6 +303,10 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
     // add extra buffer types, only if no GPU device is present
     // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
     auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
+
     auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
     auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
         ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
@@ -582,6 +591,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
                 switch (hparams.n_layer) {
                     case 32: type = LLM_TYPE_7B; break;
                     case 80: type = LLM_TYPE_70B; break;
+                    case 162: type = LLM_TYPE_405B; break;
                     default: type = LLM_TYPE_UNKNOWN;
                 }
             } break;
@@ -773,6 +783,7 @@ void llama_model::load_hparams(llama_model_loader & ml) {
            // fall through
        case LLM_ARCH_QWEN2:
            {
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type, false);
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 24: type = hparams.n_embd == 1024 ? LLM_TYPE_0_5B : LLM_TYPE_1B; break;
@@ -1481,6 +1492,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     }
 
     ggml_backend_dev_t cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+    if (cpu_dev == nullptr) {
+        throw std::runtime_error(format("%s: no CPU backend found", __func__));
+    }
     const int i_gpu_start = std::max((int) hparams.n_layer - n_gpu_layers, (int) 0);
     const int act_gpu_layers = devices.empty() ? 0 : std::min(n_gpu_layers, (int)n_layer + 1);
     auto get_layer_buft_list = [&](int il) -> llama_model::impl::layer_dev {
@@ -1648,8 +1662,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
             for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                 std::regex pattern(overrides->pattern);
                 if (std::regex_search(tensor_name, pattern)) {
-                    LLAMA_LOG_DEBUG("tensor %s buffer type overriden to %s\n", tensor_name.c_str(), ggml_backend_buft_name(overrides->buft));
                     buft = overrides->buft;
+                    LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
+                        tensor_name.c_str(),
+                        ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
+                        ggml_backend_buft_name(buft));
                     break;
                 }
             }
@@ -1666,6 +1683,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         auto * buft_dev = ggml_backend_buft_get_device(buft);
         if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
             auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!cpu_dev) {
+                throw std::runtime_error("no CPU backend found");
+            }
             buft = ggml_backend_dev_buffer_type(cpu_dev);
         }
 
@@ -1847,7 +1867,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
                     layer.bo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
+                    if (n_ff > 0) {
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+                    }
 
                     if (hparams.rope_scaling_type_train == LLAMA_ROPE_SCALING_TYPE_LONGROPE) {
                         layer.rope_long = create_tensor(tn(LLM_TENSOR_ROPE_FACTORS_LONG, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
@@ -1857,9 +1879,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.rope_freqs = create_tensor(tn(LLM_TENSOR_ROPE_FREQS, "weight", i), {n_rot/2}, TENSOR_NOT_REQUIRED | (i != 0 ? TENSOR_DUPLICATED : 0));
                     }
 
+                    if (n_ff > 0) {
                         layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
                         layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
                         layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
+                    }
 
                     // optional MLP bias
                     layer.ffn_gate_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), {n_ff}, TENSOR_NOT_REQUIRED);
@@ -3503,7 +3527,11 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -4108,6 +4136,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         if (!dev) {
             // FIXME: workaround for CPU backend buft having a NULL device
             dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+            if (!dev) {
+                throw std::runtime_error(format("%s: no CPU backend found", __func__));
+            }
         }
         ggml_backend_dev_props props;
         ggml_backend_dev_get_props(dev, &props);
@@ -4237,7 +4268,7 @@ uint64_t llama_model::n_elements() const {
 }
 
 void llama_model::print_info() const {
-    const char * rope_scaling_type = LLAMA_ROPE_SCALING_TYPES.at(hparams.rope_scaling_type_train);
+    const std::string rope_scaling_type = llama_rope_scaling_type_name(hparams.rope_scaling_type_train);
 
     auto print_f = [](const std::function<uint32_t(uint32_t)> & f, uint32_t n) {
         bool is_var = false;
@@ -4298,7 +4329,7 @@ void llama_model::print_info() const {
     LLAMA_LOG_INFO("%s: causal attn = %d\n", __func__, hparams.causal_attn);
     LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
     LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
-    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
+    LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type.c_str());
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
     LLAMA_LOG_INFO("%s: n_ctx_orig_yarn = %u\n", __func__, hparams.n_ctx_orig_yarn);
@@ -4445,6 +4476,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }
 
+ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+    // choose long/short freq factors based on the context size
+    if (layers[il].rope_freqs != nullptr) {
+        return layers[il].rope_freqs;
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+        return layers[il].rope_long;
+    }
+
+    return layers[il].rope_short;
+}
+
 struct llm_build_llama : public llm_graph_context {
     llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
@@ -4485,7 +4529,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4691,6 +4735,7 @@ struct llm_build_deci : public llm_graph_context {
             ggml_tensor * inpSA = inpL;
             const int64_t n_head_kv = hparams.n_head_kv(il);
             const int64_t n_head = hparams.n_head(il);
+            const int64_t n_ff = hparams.n_ff(il);
 
             if (n_head == 0) {
                 // attention-free layer of Llama-3_1-Nemotron-51B
@@ -4710,7 +4755,7 @@ struct llm_build_deci : public llm_graph_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4766,6 +4811,11 @@ struct llm_build_deci : public llm_graph_context {
                 inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
             }
 
+            // FFN-free layer of Llama-3_1-Nemotron-Ultra-253B
+            if (n_ff == 0) {
+                continue;
+            }
+
             // For Granite architecture
             if (hparams.f_residual_scale) {
                 cur = ggml_scale(ctx0, cur, hparams.f_residual_scale);
@@ -7192,7 +7242,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 ggml_tensor* attn_norm_output = build_norm(inpL,
                     model.layers[il].attn_norm,
@@ -7944,7 +7994,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
-            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
             // norm
             cur = build_norm(inpL,
@@ -8711,7 +8761,7 @@ struct llm_build_mamba : public llm_graph_context {
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
         const auto kv_head = kv_self->head;
 
@@ -9012,7 +9062,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9950,7 +10000,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11314,7 +11364,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11459,7 +11509,7 @@ struct llm_build_rwkv6_base : public llm_graph_context {
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
             int il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -11855,7 +11905,7 @@ struct llm_build_rwkv7_base : public llm_graph_context {
             ggml_tensor *& first_layer_value,
             const llama_ubatch & ubatch,
             int il) const {
-        const llama_kv_cache_unified * kv_self = static_cast<const llama_kv_cache_unified *>(memory);
+        const llama_kv_cache_recurrent * kv_self = static_cast<const llama_kv_cache_recurrent *>(memory);
 
         const auto n_tokens = ubatch.n_tokens;
         const auto n_seqs = ubatch.n_seqs;
@@ -12695,7 +12745,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12815,36 +12865,46 @@ struct llm_build_bailingmoe : public llm_graph_context {
     }
 };
 
-llama_memory_i * llama_model::create_memory() const {
+llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
     switch (arch) {
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_JINA_BERT_V2:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_NOMIC_BERT_MOE:
+            {
+                res = nullptr;
+            } break;
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
         case LLM_ARCH_RWKV6QWEN2:
        case LLM_ARCH_RWKV7:
        case LLM_ARCH_ARWKV7:
            {
-                res = new llama_kv_cache_unified(hparams, {
-                    /*.get_rope_factors =*/ nullptr
-                });
+                res = new llama_kv_cache_recurrent(
+                    *this,
+                    GGML_TYPE_F32,
+                    GGML_TYPE_F32,
+                    cparams.offload_kqv,
+                    std::max((uint32_t) 1, cparams.n_seq_max));
            } break;
        default:
            {
-                res = new llama_kv_cache_unified(hparams, {
-                    /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
-                        // choose long/short freq factors based on the context size
-                        if (layers[il].rope_freqs != nullptr) {
-                            return layers[il].rope_freqs;
-                        }
-
-                        if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
-                            return layers[il].rope_long;
-                        }
-
-                        return layers[il].rope_short;
-                    }
-                });
+                const auto padding = llama_kv_cache_unified::get_padding(cparams);
+
+                cparams.n_ctx = GGML_PAD(cparams.n_ctx, padding);
+
+                LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
+
+                res = new llama_kv_cache_unified(
+                    *this,
+                    params.type_k,
+                    params.type_v,
+                    !cparams.flash_attn,
+                    cparams.offload_kqv,
+                    cparams.n_ctx,
+                    padding);
            }
    }
 
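In the default (unified KV cache) branch above, the requested context size is rounded up to the cache padding before allocation. Below is a small standalone illustration of that rounding, with 256 as an assumed example padding value; the real value comes from llama_kv_cache_unified::get_padding(cparams).

// Illustration only (not from the diff): round n_ctx up to a multiple of the KV cache padding.
#include <cstdint>
#include <cstdio>

static uint32_t pad_to_multiple(uint32_t x, uint32_t n) {
    // same effect as the GGML_PAD macro used above for power-of-two n
    return ((x + n - 1) / n) * n;
}

int main() {
    const uint32_t padding = 256;          // assumed value for the example
    const uint32_t n_ctx_requested = 4097; // arbitrary request
    const uint32_t n_ctx_padded = pad_to_multiple(n_ctx_requested, padding);
    printf("n_ctx = %u -> %u (padded)\n", n_ctx_requested, n_ctx_padded); // prints 4097 -> 4352
    return 0;
}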
@@ -13226,8 +13286,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_DECI:
         case LLM_ARCH_BAICHUAN:
         case LLM_ARCH_STARCODER:
-        case LLM_ARCH_PLAMO:
-        case LLM_ARCH_ORION:
         case LLM_ARCH_INTERNLM2:
         case LLM_ARCH_MINICPM:
         case LLM_ARCH_XVERSE:
@@ -13265,6 +13323,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_PHI2:
         case LLM_ARCH_PHI3:
         case LLM_ARCH_PHIMOE:
+        case LLM_ARCH_PLAMO:
         case LLM_ARCH_GEMMA:
         case LLM_ARCH_GEMMA2:
         case LLM_ARCH_GEMMA3:
@@ -13272,6 +13331,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_OPENELM:
         case LLM_ARCH_GPTNEOX:
         case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_ORION:
         case LLM_ARCH_NEMOTRON:
         case LLM_ARCH_EXAONE:
         case LLM_ARCH_MINICPM3:
@@ -13344,6 +13404,14 @@ const char * llama_model_chat_template(const llama_model * model, const char * n
         : LLM_KV(model->arch)(LLM_KV_TOKENIZER_CHAT_TEMPLATE);
     const auto & it = model->gguf_kv.find(key);
     if (it == model->gguf_kv.end()) {
+        // one-off fix for very popular models (so we are not flooded with issues)
+        // do not extend this list unless absolutely necessary
+        // Mistral-Small-2503 does not have built-in chat template
+        llama_vocab_pre_type pre_type = model->vocab.get_pre_type();
+        if (pre_type == LLAMA_VOCAB_PRE_TYPE_TEKKEN && model->layers.size() == 40) {
+            return "mistral-v7-tekken";
+        }
+
         return nullptr;
     }
 
examples/talk-llama/llama-model.h
@@ -76,6 +76,7 @@ enum llm_type {
     LLM_TYPE_236B,
     LLM_TYPE_290B,
     LLM_TYPE_314B,
+    LLM_TYPE_405B,
     LLM_TYPE_671B,
     LLM_TYPE_SMALL,
     LLM_TYPE_MEDIUM,
@@ -95,6 +96,8 @@ enum llm_type {
     LLM_TYPE_235B_A22B,
 };
 
+std::string llama_rope_scaling_type_name(llama_rope_scaling_type rope_scaling_type);
+
 struct llama_layer_posnet {
     // resnet
     struct ggml_tensor * norm1 = nullptr;
@@ -395,8 +398,11 @@ struct llama_model {
 
     const struct ggml_tensor * get_tensor(const char * name) const;
 
+    ggml_tensor * get_rope_factors(uint32_t n_ctx_per_seq, int il) const;
+
+    // note: can mutate `cparams`
     // TODO: move this to new llm_arch_model_i interface
-    llama_memory_i * create_memory() const; // TODO: params
+    llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
 
     // TODO: move this to new llm_arch_model_i interface
     llm_graph_result_ptr build_graph(
examples/talk-llama/llama-quant.cpp
@@ -519,7 +519,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
         nthread = std::thread::hardware_concurrency();
     }
 
-    // mmap consistently increases speed Linux, and also increases speed on Windows with
+    // mmap consistently increases speed on Linux, and also increases speed on Windows with
     // hot cache. It may cause a slowdown on macOS, possibly related to free memory.
 #if defined(__linux__) || defined(_WIN32)
     constexpr bool use_mmap = true;
@@ -529,7 +529,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
     llama_model_kv_override * kv_overrides = nullptr;
     if (params->kv_overrides) {
-        auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
         kv_overrides = v->data();
     }
 
examples/talk-llama/llama-sampling.cpp
@@ -1750,23 +1750,35 @@ static const char * llama_sampler_top_n_sigma_name(const struct llama_sampler *
 static void llama_sampler_top_n_sigma_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
     const auto * ctx = (llama_sampler_top_n_sigma *) smpl->ctx;
+
+    if (ctx->n <= 0.0f || cur_p->size <= 1) {
+        return;
+    }
 
     // find max logit and calculate mean
     float max = cur_p->data[0].logit;
     float logits_sum = 0;
+    size_t valid_count = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
+        // Only count non-negative infinity values
+        if (cur_p->data[i].logit != -INFINITY) {
             if (cur_p->data[i].logit > max) {
                 max = cur_p->data[i].logit;
             }
             logits_sum += cur_p->data[i].logit;
+            valid_count++;
+        }
     }
-    float mean = logits_sum/cur_p->size;
+    float mean = valid_count > 0 ? logits_sum/valid_count : 0;
 
     // calculate standard deviation
     float acc = 0;
     for (size_t i = 0; i < cur_p->size; ++i) {
+        // Skip -infinity in std calculation
+        if (cur_p->data[i].logit != -INFINITY) {
             acc += pow(cur_p->data[i].logit - mean, 2);
+        }
     }
-    float std = sqrt(acc/cur_p->size);
+    float std = valid_count > 0 ? sqrt(acc/valid_count) : 0;
 
     //apply mask
     for (size_t i = 0; i < cur_p->size; ++i) {
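The fix above makes the top-n-sigma statistics ignore logits that earlier samplers already masked to -INFINITY, and bails out early for n <= 0 or single-candidate lists. The standalone sketch below restates the rule with illustrative names (it is not the llama.cpp implementation): take the mean and standard deviation over the finite logits only, then mask every token whose logit falls more than n standard deviations below the maximum.

// Sketch of the top-n-sigma masking rule over a plain vector of logits.
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

static void top_n_sigma_mask(std::vector<float> & logits, float n) {
    const float neg_inf = -std::numeric_limits<float>::infinity();

    // statistics over finite logits only
    float max = neg_inf, sum = 0.0f;
    size_t valid = 0;
    for (float l : logits) {
        if (l == neg_inf) continue;
        max = std::max(max, l);
        sum += l;
        valid++;
    }
    if (valid == 0) return;

    const float mean = sum / valid;
    float acc = 0.0f;
    for (float l : logits) {
        if (l != neg_inf) acc += (l - mean) * (l - mean);
    }
    const float std = std::sqrt(acc / valid);

    // mask everything more than n standard deviations below the best logit
    for (float & l : logits) {
        if (l < max - n * std) l = neg_inf;
    }
}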
examples/talk-llama/llama-vocab.cpp
@@ -1,5 +1,7 @@
 #include "llama-vocab.h"
 
+#include "ggml.h"
+#include "gguf.h"
 #include "llama-impl.h"
 #include "llama-model-loader.h"
 
@@ -415,6 +417,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SEED_CODER:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\r\n]+|\\s*[\r\n]+|\\s+(?!\\S)|\\s+"
+                    "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1}| ?[^\\s\\p{L}\\p{N}\\r\\n]+|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1227,6 +1236,9 @@ struct fragment_buffer_variant {
 struct llama_vocab::impl {
     uint32_t n_token_types = 0; // for BERT-style token types
 
+    std::string tokenizer_model;
+    std::string tokenizer_pre;
+
     enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
     enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
 
@@ -1362,9 +1374,6 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
     // determine vocab type
     {
-        std::string tokenizer_model;
-        std::string tokenizer_pre;
-
         ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_model);
         ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
 
@@ -1459,7 +1468,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
 
         const int precompiled_charsmap_keyidx = gguf_find_key(ctx, kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP).c_str());
         if (precompiled_charsmap_keyidx != -1) {
-            size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
+            const gguf_type pc_type = gguf_get_arr_type(ctx, precompiled_charsmap_keyidx);
+            GGML_ASSERT(pc_type == GGUF_TYPE_INT8 || pc_type == GGUF_TYPE_UINT8);
+
+            const size_t n_precompiled_charsmap = gguf_get_arr_n(ctx, precompiled_charsmap_keyidx);
             const char * pc = (const char *) gguf_get_arr_data(ctx, precompiled_charsmap_keyidx);
             precompiled_charsmap.assign(pc, pc + n_precompiled_charsmap);
 #ifdef IS_BIG_ENDIAN
@@ -1634,6 +1646,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "bailingmoe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
                 clean_spaces = false;
+            } else if (
+                tokenizer_pre == "seed-coder") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_SEED_CODER;
+                clean_spaces = false;
             } else {
                 throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
             }
@@ -2778,6 +2794,14 @@ void llama_vocab::load(llama_model_loader & ml, const LLM_KV & kv) {
     pimpl->load(ml, kv);
 }
 
+std::string llama_vocab::get_tokenizer_model() const {
+    return pimpl->tokenizer_model;
+}
+
+std::string llama_vocab::get_tokenizer_pre() const {
+    return pimpl->tokenizer_pre;
+}
+
 enum llama_vocab_type llama_vocab::get_type() const {
     return pimpl->type;
 }
@@ -3000,6 +3024,20 @@ int llama_vocab::find_bpe_rank(const std::string & token_left, const std::string
     return it->second;
 }
 
+std::vector<std::string> llama_vocab::get_bpe_merges() const {
+    std::vector<std::string> result(pimpl->bpe_ranks.size());
+
+    for (const auto & pair : pimpl->bpe_ranks) {
+        result[pair.second] = pair.first.first + " " + pair.first.second;
+    }
+
+    return result;
+}
+
+std::vector<char> llama_vocab::get_precompiled_charsmap() const {
+    return pimpl->precompiled_charsmap;
+}
+
 int32_t llama_vocab::tokenize(
         const char * text,
         int32_t text_len,
examples/talk-llama/llama-vocab.h
@@ -21,6 +21,9 @@ struct llama_vocab {
 
     void load(llama_model_loader & ml, const LLM_KV & kv);
 
+    std::string get_tokenizer_model() const;
+    std::string get_tokenizer_pre() const;
+
     enum llama_vocab_type get_type() const;
     enum llama_vocab_pre_type get_pre_type() const;
 
@@ -80,6 +83,9 @@ struct llama_vocab {
     int max_token_len() const;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
+    std::vector<std::string> get_bpe_merges() const;
+
+    std::vector<char> get_precompiled_charsmap() const;
 
     int32_t tokenize(
             const char * text,
examples/talk-llama/llama.cpp
@@ -4,6 +4,7 @@
 #include "llama-mmap.h"
 #include "llama-vocab.h"
 #include "llama-model-loader.h"
+#include "llama-model-saver.h"
 #include "llama-model.h"
 
 #include "ggml.h"
@@ -16,6 +17,10 @@
 #include <cstring>
 #include <ctime>
 
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
 //
 // interface implementation
 //
@@ -249,6 +254,13 @@ struct llama_model * llama_model_load_from_splits(
     return llama_model_load_from_file_impl(splits.front(), splits, params);
 }
 
+void llama_model_save_to_file(const struct llama_model * model, const char * path_model) {
+    llama_model_saver ms(*model);
+    ms.add_kv_from_model();
+    ms.add_tensors_from_model();
+    ms.save(path_model);
+}
+
 //
 // chat templates
 //
@@ -334,3 +346,4 @@ const char * llama_print_system_info(void) {
 
     return s.c_str();
 }
+
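A hedged round-trip sketch for the new public entry point (file paths are placeholders and error handling is minimal). llama_model_load_from_file(), llama_model_default_params() and llama_model_free() are existing public API calls; llama_model_save_to_file() is the function added above.

// Sketch: load a GGUF model and write it back out through the new save API.
#include "llama.h"

int main(void) {
    struct llama_model_params mparams = llama_model_default_params();

    struct llama_model * model = llama_model_load_from_file("input.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    // writes architecture/hparams/tokenizer KV pairs and all tensors back out
    llama_model_save_to_file(model, "output.gguf");

    llama_model_free(model);
    return 0;
}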
examples/talk-llama/llama.h
@@ -4,6 +4,7 @@
 #include "ggml.h"
 #include "ggml-cpu.h"
 #include "ggml-backend.h"
+#include "ggml-opt.h"
 
 #include <stddef.h>
 #include <stdint.h>
@@ -112,6 +113,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
         LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
         LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+        LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
     };
 
     enum llama_rope_type {
@@ -343,7 +345,7 @@ extern "C" {
         float yarn_beta_fast; // YaRN low correction dim
         float yarn_beta_slow; // YaRN high correction dim
         uint32_t yarn_orig_ctx; // YaRN original context size
-        float defrag_thold; // defragment the KV cache if holes/size > thold, < 0 disabled (default)
+        float defrag_thold; // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
 
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
@@ -351,19 +353,18 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings; // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf; // whether to measure performance timings
-
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void * abort_callback_data;
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings; // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf; // whether to measure performance timings
+        bool op_offload; // whether to offload host tensor operations to device
     };
 
     // model quantization parameters
@@ -445,6 +446,10 @@ extern "C" {
             size_t n_paths,
             struct llama_model_params params);
 
+    LLAMA_API void llama_model_save_to_file(
+            const struct llama_model * model,
+            const char * path_model);
+
     DEPRECATED(LLAMA_API void llama_free_model(struct llama_model * model),
             "use llama_model_free instead");
 
@@ -924,14 +929,19 @@ extern "C" {
     // Frees a batch of tokens allocated with llama_batch_init()
     LLAMA_API void llama_batch_free(struct llama_batch batch);
 
-    // Processes a batch of tokens with the ecoder part of the encoder-decoder model.
-    // Stores the encoder output internally for later use by the decoder cross-attention layers.
+    // Process a batch of tokens.
+    // In contrast to llama_decode() - this call does not use KV cache.
+    // For encode-decoder contexts, processes the batch using the encoder.
+    // Can store the encoder output internally for later use by the decoder's cross-attention layers.
     // 0 - success
     // < 0 - error. the KV cache state is restored to the state before this call
     LLAMA_API int32_t llama_encode(
             struct llama_context * ctx,
             struct llama_batch batch);
 
+    // Process a batch of tokens.
+    // Requires KV cache.
+    // For encode-decoder contexts, processes the batch using the decoder.
     // Positive return values does not mean a fatal error, but rather a warning.
     // 0 - success
     // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
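The reworded comments above draw the contract more sharply: llama_encode() does not touch the KV cache and runs the encoder for encoder-decoder contexts, while llama_decode() requires the KV cache and runs the decoder. A small sketch under those assumptions (the helper function is illustrative, not part of the API):

// Sketch: encoder pass followed by the first decoder step for an encoder-decoder model.
#include "llama.h"

static int encode_then_decode(struct llama_context * ctx, const struct llama_model * model,
                              llama_token * prompt_tokens, int32_t n_prompt) {
    // encoder pass: no KV cache involved
    if (llama_encode(ctx, llama_batch_get_one(prompt_tokens, n_prompt)) != 0) {
        return -1;
    }

    // decoder pass: requires the KV cache; start from the decoder start token
    llama_token dec_start = llama_model_decoder_start_token(model);
    if (llama_decode(ctx, llama_batch_get_one(&dec_start, 1)) != 0) {
        return -1; // e.g. no KV slot available
    }
    return 0;
}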
@@ -1428,6 +1438,37 @@ extern "C" {
     LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
 
+    //
+    // training
+    //
+
+    // function that returns whether or not a given tensor contains trainable parameters
+    typedef bool (*llama_opt_param_filter)(const struct ggml_tensor * tensor, void * userdata);
+
+    // always returns true
+    LLAMA_API bool llama_opt_param_filter_all(const struct ggml_tensor * tensor, void * userdata);
+
+    struct llama_opt_params {
+        uint32_t n_ctx_train; // assumed context size post training, use context size specified in llama_context if 0
+
+        llama_opt_param_filter param_filter; // callback for determining which tensors contain trainable parameters
+        void * param_filter_ud; // userdata for determining which tensors contain trainable parameters
+
+        ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
+        void * get_opt_pars_ud; // userdata for calculating optimizer parameters
+    };
+
+    LLAMA_API void llama_opt_init(struct llama_context * lctx, struct llama_model * model, struct llama_opt_params lopt_params);
+
+    LLAMA_API void llama_opt_epoch(
+            struct llama_context * lctx,
+            ggml_opt_dataset_t dataset,
+            ggml_opt_result_t result_train,
+            ggml_opt_result_t result_eval,
+            int64_t idata_split,
+            ggml_opt_epoch_callback callback_train,
+            ggml_opt_epoch_callback callback_eval);
+
 #ifdef __cplusplus
 }
 #endif
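A heavily hedged sketch of how the new training hooks might be driven. llama_opt_param_filter_all, llama_opt_init and llama_opt_epoch come from the declarations above; the ggml-opt helpers used here (ggml_opt_get_default_optimizer_params, ggml_opt_result_init/free, ggml_opt_dataset_ndata, ggml_opt_epoch_callback_progress_bar) are assumed to be available from ggml-opt.h, and building the ggml_opt_dataset_t itself is out of scope.

// Sketch: run one training/eval epoch over an already-constructed dataset.
#include "llama.h"

static void run_one_epoch(struct llama_context * lctx, struct llama_model * model,
                          ggml_opt_dataset_t dataset) {
    struct llama_opt_params lopt_params = {
        /*.n_ctx_train     =*/ 0,                                     // use the context size of lctx
        /*.param_filter    =*/ llama_opt_param_filter_all,            // train every tensor
        /*.param_filter_ud =*/ NULL,
        /*.get_opt_pars    =*/ ggml_opt_get_default_optimizer_params, // assumed helper from ggml-opt.h
        /*.get_opt_pars_ud =*/ NULL,
    };
    llama_opt_init(lctx, model, lopt_params);

    ggml_opt_result_t result_train = ggml_opt_result_init();
    ggml_opt_result_t result_eval  = ggml_opt_result_init();

    // first 90% of the dataset is used for training, the rest for evaluation
    const int64_t idata_split = ggml_opt_dataset_ndata(dataset) * 9 / 10;

    llama_opt_epoch(lctx, dataset, result_train, result_eval, idata_split,
                    ggml_opt_epoch_callback_progress_bar, ggml_opt_epoch_callback_progress_bar);

    ggml_opt_result_free(result_train);
    ggml_opt_result_free(result_eval);
}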