talk-llama : sync llama.cpp
@@ -34,6 +34,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_PHI3, "phi3" },
    { LLM_ARCH_PHIMOE, "phimoe" },
    { LLM_ARCH_PLAMO, "plamo" },
    { LLM_ARCH_PLAMO2, "plamo2" },
    { LLM_ARCH_CODESHELL, "codeshell" },
    { LLM_ARCH_ORION, "orion" },
    { LLM_ARCH_INTERNLM2, "internlm2" },
@@ -67,6 +68,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_JAIS, "jais" },
    { LLM_ARCH_NEMOTRON, "nemotron" },
    { LLM_ARCH_EXAONE, "exaone" },
    { LLM_ARCH_EXAONE4, "exaone4" },
    { LLM_ARCH_RWKV6, "rwkv6" },
    { LLM_ARCH_RWKV6QWEN2, "rwkv6qwen2" },
    { LLM_ARCH_RWKV7, "rwkv7" },
@@ -81,9 +83,11 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
    { LLM_ARCH_DOTS1, "dots1" },
    { LLM_ARCH_ARCEE, "arcee" },
    { LLM_ARCH_ERNIE4_5, "ernie4_5" },
    { LLM_ARCH_ERNIE4_5_MOE, "ernie4_5-moe" },
    { LLM_ARCH_HUNYUAN_MOE, "hunyuan-moe" },
    { LLM_ARCH_SMOLLM3, "smollm3" },
    { LLM_ARCH_LFM2, "lfm2" },
    { LLM_ARCH_DREAM, "dream" },
    { LLM_ARCH_UNKNOWN, "(unknown)" },
};

@@ -784,6 +788,36 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_PLAMO2,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT, "output" },
            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
            { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" },
            { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" },
            { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" },
            { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" },
            { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" },
            { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" },
            { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" },
            { LLM_TENSOR_SSM_DT_NORM, "blk.%d.ssm_dt_norm" },
            { LLM_TENSOR_SSM_B_NORM, "blk.%d.ssm_b_norm" },
            { LLM_TENSOR_SSM_C_NORM, "blk.%d.ssm_c_norm" },
            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
        },
    },
    {
        LLM_ARCH_CODESHELL,
        {
@@ -1477,6 +1511,26 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_EXAONE4,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT, "output" },
            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
            { LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
            { LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
        }
    },
    {
        LLM_ARCH_RWKV6,
        {
@@ -1793,6 +1847,31 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_ERNIE4_5_MOE,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT, "output" },
            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
            { LLM_TENSOR_FFN_GATE_INP, "blk.%d.ffn_gate_inp" },
            { LLM_TENSOR_FFN_GATE_SHEXP, "blk.%d.ffn_gate_shexp" },
            { LLM_TENSOR_FFN_DOWN_SHEXP, "blk.%d.ffn_down_shexp" },
            { LLM_TENSOR_FFN_UP_SHEXP, "blk.%d.ffn_up_shexp" },
            { LLM_TENSOR_FFN_GATE_EXPS, "blk.%d.ffn_gate_exps" },
            { LLM_TENSOR_FFN_DOWN_EXPS, "blk.%d.ffn_down_exps" },
            { LLM_TENSOR_FFN_UP_EXPS, "blk.%d.ffn_up_exps" },
            { LLM_TENSOR_FFN_EXP_PROBS_B, "blk.%d.exp_probs_b" },
        },
    },
    {
        LLM_ARCH_HUNYUAN_MOE,
        {
@@ -1854,6 +1933,23 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
            { LLM_TENSOR_TOKEN_EMBD_NORM, "token_embd_norm" },
        }
    },
    {
        LLM_ARCH_DREAM,
        {
            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
            { LLM_TENSOR_OUTPUT, "output" },
            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
        },
    },
    {
        LLM_ARCH_UNKNOWN,
        {
@@ -2094,6 +2190,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
    switch (arch) {
        case LLM_ARCH_JAMBA:
        case LLM_ARCH_FALCON_H1:
        case LLM_ARCH_PLAMO2:
        case LLM_ARCH_GRANITE_HYBRID:
        case LLM_ARCH_LFM2:
            return true;
@@ -2101,3 +2198,12 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
            return false;
    }
}

bool llm_arch_is_diffusion(const llm_arch & arch) {
    switch (arch) {
        case LLM_ARCH_DREAM:
            return true;
        default:
            return false;
    }
}

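Aside: each "blk.%d.*" value in the LLM_TENSOR_NAMES entries above is a printf-style pattern that is expanded with the layer index when per-layer tensors are resolved. The helper below is only an illustration of that expansion; it is not the formatting utility llama.cpp itself uses.

#include <cstdio>
#include <string>

// Illustrative helper (hypothetical): expand a per-layer tensor name pattern,
// e.g. tensor_name("blk.%d.attn_qkv", 3) -> "blk.3.attn_qkv".
static std::string tensor_name(const char * pattern, int il) {
    char buf[256];
    std::snprintf(buf, sizeof(buf), pattern, il);
    return buf;
}
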
@@ -38,6 +38,7 @@ enum llm_arch {
    LLM_ARCH_PHI3,
    LLM_ARCH_PHIMOE,
    LLM_ARCH_PLAMO,
    LLM_ARCH_PLAMO2,
    LLM_ARCH_CODESHELL,
    LLM_ARCH_ORION,
    LLM_ARCH_INTERNLM2,
@@ -71,6 +72,7 @@ enum llm_arch {
    LLM_ARCH_JAIS,
    LLM_ARCH_NEMOTRON,
    LLM_ARCH_EXAONE,
    LLM_ARCH_EXAONE4,
    LLM_ARCH_RWKV6,
    LLM_ARCH_RWKV6QWEN2,
    LLM_ARCH_RWKV7,
@@ -85,9 +87,11 @@ enum llm_arch {
    LLM_ARCH_DOTS1,
    LLM_ARCH_ARCEE,
    LLM_ARCH_ERNIE4_5,
    LLM_ARCH_ERNIE4_5_MOE,
    LLM_ARCH_HUNYUAN_MOE,
    LLM_ARCH_SMOLLM3,
    LLM_ARCH_LFM2,
    LLM_ARCH_DREAM,
    LLM_ARCH_UNKNOWN,
};

@@ -478,3 +482,4 @@ const llm_tensor_info & llm_tensor_info_for(llm_tensor tensor);

bool llm_arch_is_recurrent(const llm_arch & arch);
bool llm_arch_is_hybrid (const llm_arch & arch);
bool llm_arch_is_diffusion(const llm_arch & arch);

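Aside: a minimal sketch of how callers can branch on the predicates declared above. The describe_arch() wrapper is hypothetical; only the three llm_arch_is_* functions come from the header, and the file assumes the internal llama-arch.h declarations are visible.

// Hypothetical helper built on the predicates declared above.
static const char * describe_arch(const llm_arch & arch) {
    if (llm_arch_is_diffusion(arch)) {
        return "diffusion language model (e.g. LLM_ARCH_DREAM)";
    }
    if (llm_arch_is_hybrid(arch)) {
        return "hybrid attention/recurrent model (e.g. LLM_ARCH_PLAMO2, LLM_ARCH_LFM2)";
    }
    if (llm_arch_is_recurrent(arch)) {
        return "recurrent model";
    }
    return "standard transformer";
}
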
@@ -27,6 +27,7 @@ bool llama_batch_allocr::init(
        const llama_vocab & vocab,
        const llama_memory_i * memory,
        uint32_t n_embd,
        uint32_t n_seq_max,
        bool output_all) {
    clear();

@@ -40,6 +41,11 @@ bool llama_batch_allocr::init(
    // validate input batch
    //

    if (n_seq_max > LLAMA_MAX_SEQ) {
        LLAMA_LOG_ERROR("%s: n_seq_max = %d > %d\n", __func__, n_seq_max, LLAMA_MAX_SEQ);
        return false;
    }

    if (batch.token) {
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= vocab.n_tokens()) {
@@ -52,8 +58,8 @@ bool llama_batch_allocr::init(
    if (batch.seq_id) {
        for (int32_t i = 0; i < batch.n_tokens; ++i) {
            for (int32_t s = 0; s < batch.n_seq_id[i]; ++s) {
                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= LLAMA_MAX_SEQ)) {
                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], LLAMA_MAX_SEQ);
                if (batch.seq_id && (batch.seq_id[i][s] < 0 || batch.seq_id[i][s] >= (llama_seq_id) n_seq_max)) {
                    LLAMA_LOG_ERROR("%s: invalid seq_id[%d][%d] = %d > %d\n", __func__, i, s, batch.seq_id[i][s], (llama_seq_id) n_seq_max);
                    return false;
                }
            }
@@ -86,7 +92,7 @@ bool llama_batch_allocr::init(

    // initialize the starting position for each sequence based on the positions in the memory
    llama_pos p0[LLAMA_MAX_SEQ];
    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (!memory) {
            // if no memory -> start from 0
            p0[s] = 0;
@@ -144,12 +150,15 @@ bool llama_batch_allocr::init(
    //

    this->n_embd = n_embd;
    this->n_seq_max = n_seq_max;

    // count the outputs in this batch
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
        n_outputs += batch.logits[i] != 0;
    }

    has_cpl = false;

    // determine coupled sequences
    // these are pairs of sequences that have at least one token in the input batch that is assigned to both of them
    for (int32_t i = 0; i < batch.n_tokens; ++i) {
@@ -189,7 +198,7 @@ bool llama_batch_allocr::init(
        seq_set_map[cur].push_back(i);
    }

    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_set_unq.test(s)) {
            seq_idx[s] = seq_id_unq.size();
            seq_id_unq.push_back(s);
@@ -201,7 +210,7 @@ bool llama_batch_allocr::init(
        LLAMA_LOG_DEBUG("%s: input batch info:\n", __func__);

        llama_ubatch ubatch {
            /*.equal_seqs =*/ false,
            /*.b_equal_seqs =*/ false,
            /*.n_tokens =*/ (uint32_t) batch.n_tokens,
            /*.n_seq_tokens =*/ (uint32_t) 1,
            /*.n_seqs =*/ (uint32_t) batch.n_tokens,
@@ -214,6 +223,7 @@ bool llama_batch_allocr::init(
            /*.seq_id_unq =*/ this->seq_id_unq.data(),
            /*.seq_idx =*/ this->seq_idx.data(),
            /*.output =*/ batch.logits,
            /*.data =*/ {},
        };

        ubatch_print(ubatch, debug);
@@ -241,7 +251,7 @@ bool llama_batch_allocr::init(
    // consistency checks
    //

    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_pos[s].empty()) {
            continue;
        }
@@ -284,8 +294,8 @@ bool llama_batch_allocr::init(
    }

    if (memory) {
        for (int32_t s0 = 0; s0 < LLAMA_MAX_SEQ; ++s0) {
            for (int32_t s1 = 0; s1 < LLAMA_MAX_SEQ; ++s1) {
        for (uint32_t s0 = 0; s0 < n_seq_max; ++s0) {
            for (uint32_t s1 = 0; s1 < n_seq_max; ++s1) {
                if (seq_cpl[s0][s1]) {
                    if (memory->seq_pos_min(s0) != memory->seq_pos_min(s1) ||
                        memory->seq_pos_max(s0) != memory->seq_pos_max(s1)) {
@@ -316,12 +326,12 @@ bool llama_batch_allocr::init(
    //
    {
        seq_set_t cur_seq_set[LLAMA_MAX_SEQ];
        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
        for (uint32_t s = 0; s < n_seq_max; ++s) {
            cur_seq_set[s].set();
        }

        llama_pos cur_seq_pos[LLAMA_MAX_SEQ];
        for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
        for (uint32_t s = 0; s < n_seq_max; ++s) {
            cur_seq_pos[s] = -1;
        }

@@ -357,39 +367,38 @@ llama_ubatch llama_batch_allocr::ubatch_reserve(uint32_t n_seq_tokens, uint32_t
    clear();
    split_reset();

    ubatches.emplace_back();
    auto udata = std::make_shared<llama_ubatch::data_t>();

    auto & ubatch = ubatches.back();

    ubatch.token .resize(n_tokens);
    ubatch.embd .clear();
    ubatch.pos .resize(n_tokens);
    ubatch.n_seq_id .resize(n_tokens);
    ubatch.seq_id .resize(n_tokens);
    ubatch.seq_id_unq.resize(0);
    ubatch.seq_idx .resize(LLAMA_MAX_SEQ, -1);
    ubatch.output .resize(n_tokens);
    udata->token .resize(n_tokens);
    udata->embd .clear();
    udata->pos .resize(n_tokens);
    udata->n_seq_id .resize(n_tokens);
    udata->seq_id .resize(n_tokens);
    udata->seq_id_unq.resize(0);
    udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
    udata->output .resize(n_tokens);

    for (uint32_t s = 0; s < n_seqs; ++s) {
        ubatch.seq_idx[s] = s;
        ubatch.seq_id_unq.push_back(s);
        udata->seq_idx[s] = s;
        udata->seq_id_unq.push_back(s);
    }

    llama_ubatch res {
        /*.equal_seqs =*/ true,
        /*.b_equal_seqs =*/ true,
        /*.n_tokens =*/ n_tokens,
        /*.n_seq_tokens =*/ n_seq_tokens,
        /*.n_seqs =*/ n_seqs,
        /*.n_seqs_unq =*/ n_seqs,

        /*.token =*/ ubatch.token.data(),
        /*.token =*/ udata->token.data(),
        /*.embd =*/ nullptr,
        /*.pos =*/ ubatch.pos.data(),
        /*.n_seq_id =*/ ubatch.n_seq_id.data(),
        /*.seq_id =*/ ubatch.seq_id.data(),
        /*.seq_id_unq =*/ ubatch.seq_id_unq.data(),
        /*.seq_idx =*/ ubatch.seq_idx.data(),
        /*.output =*/ ubatch.output.data(),
        /*.pos =*/ udata->pos.data(),
        /*.n_seq_id =*/ udata->n_seq_id.data(),
        /*.seq_id =*/ udata->seq_id.data(),
        /*.seq_id_unq =*/ udata->seq_id_unq.data(),
        /*.seq_idx =*/ udata->seq_idx.data(),
        /*.output =*/ udata->output.data(),
        /*.data =*/ std::move(udata),
    };

    return res;
@@ -430,8 +439,6 @@ void llama_batch_allocr::split_reset() {

    used.clear();
    used.resize(get_n_tokens(), false);

    ubatches.clear();
}

llama_ubatch llama_batch_allocr::split_simple(uint32_t n_ubatch) {
@@ -646,78 +653,77 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u

    assert(n_tokens%n_seqs == 0);

    ubatches.emplace_back();

    auto & ubatch = ubatches.back();
    auto udata = std::make_shared<llama_ubatch::data_t>();

    const int32_t n_pos_cur = batch.embd ? n_pos_per_embd : 1;

    const int64_t n_embd_all = batch.embd ? (int64_t) n_tokens*n_embd : 0;
    const int64_t n_pos_all = (int64_t) n_tokens*n_pos_cur;

    ubatch.token .resize(n_tokens);
    ubatch.embd .resize(n_embd_all);
    ubatch.pos .resize(n_pos_all);
    ubatch.n_seq_id .resize(n_tokens);
    ubatch.seq_id .resize(n_tokens);
    ubatch.seq_id_unq.resize(0);
    ubatch.seq_idx .resize(LLAMA_MAX_SEQ, -1);
    ubatch.output .resize(n_tokens);
    udata->token .resize(n_tokens);
    udata->embd .resize(n_embd_all);
    udata->pos .resize(n_pos_all);
    udata->n_seq_id .resize(n_tokens);
    udata->seq_id .resize(n_tokens);
    udata->seq_id_unq.resize(0);
    udata->seq_idx .resize(LLAMA_MAX_SEQ, -1);
    udata->output .resize(n_tokens);

    seq_set_t seq_set_unq;

    for (size_t i = 0; i < idxs.size(); ++i) {
        if (batch.token) {
            ubatch.token[i] = batch.token[idxs[i]];
            udata->token[i] = batch.token[idxs[i]];
        }

        if (batch.embd) {
            memcpy(ubatch.embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
            memcpy(udata->embd.data() + i*n_embd, batch.embd + (int64_t) idxs[i]*n_embd, n_embd*sizeof(float));
        }

        for (int j = 0; j < n_pos_cur; ++j) {
            ubatch.pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
            udata->pos[j*n_tokens + i] = batch.pos[j*batch.n_tokens + idxs[i]];
        }

        ubatch.n_seq_id[i] = batch.n_seq_id[idxs[i]];
        ubatch.seq_id[i] = batch.seq_id[idxs[i]];
        ubatch.output[i] = batch.logits[idxs[i]];
        udata->n_seq_id[i] = batch.n_seq_id[idxs[i]];
        udata->seq_id[i] = batch.seq_id[idxs[i]];
        udata->output[i] = batch.logits[idxs[i]];

        for (int s = 0; s < ubatch.n_seq_id[i]; ++s) {
            seq_set_unq.set(ubatch.seq_id[i][s]);
        for (int s = 0; s < udata->n_seq_id[i]; ++s) {
            seq_set_unq.set(udata->seq_id[i][s]);
        }

        if (ubatch.output[i]) {
        if (udata->output[i]) {
            out_ids.push_back(idxs[i]);
        }
    }

    for (int32_t s = 0; s < LLAMA_MAX_SEQ; ++s) {
    for (uint32_t s = 0; s < n_seq_max; ++s) {
        if (seq_set_unq.test(s)) {
            ubatch.seq_idx[s] = ubatch.seq_id_unq.size();
            ubatch.seq_id_unq.push_back(s);
            udata->seq_idx[s] = udata->seq_id_unq.size();
            udata->seq_id_unq.push_back(s);
        }
    }

    llama_ubatch res {
        /*.equal_seqs =*/ equal_seqs,
        /*.b_equal_seqs =*/ equal_seqs,
        /*.n_tokens =*/ n_tokens,
        /*.n_seq_tokens =*/ n_tokens/n_seqs,
        /*.n_seqs =*/ n_seqs,
        /*.n_seqs_unq =*/ (uint32_t) ubatch.seq_id_unq.size(),
        /*.n_seqs_unq =*/ (uint32_t) udata->seq_id_unq.size(),

        /*.token =*/ batch.token ? ubatch.token.data() : nullptr,
        /*.embd =*/ batch.embd ? ubatch.embd.data() : nullptr,
        /*.pos =*/ ubatch.pos.data(),
        /*.n_seq_id =*/ ubatch.n_seq_id.data(),
        /*.seq_id =*/ ubatch.seq_id.data(),
        /*.seq_id_unq =*/ ubatch.seq_id_unq.data(),
        /*.seq_idx =*/ ubatch.seq_idx.data(),
        /*.output =*/ ubatch.output.data(),
        /*.token =*/ batch.token ? udata->token.data() : nullptr,
        /*.embd =*/ batch.embd ? udata->embd.data() : nullptr,
        /*.pos =*/ udata->pos.data(),
        /*.n_seq_id =*/ udata->n_seq_id.data(),
        /*.seq_id =*/ udata->seq_id.data(),
        /*.seq_id_unq =*/ udata->seq_id_unq.data(),
        /*.seq_idx =*/ udata->seq_idx.data(),
        /*.output =*/ udata->output.data(),
        /*.data =*/ std::move(udata),
    };

    if (debug > 0) {
        LLAMA_LOG_DEBUG("%s: added ubatch %d to split:\n", __func__, (int) ubatches.size() - 1);
        LLAMA_LOG_DEBUG("%s: added ubatch to split:\n", __func__);

        ubatch_print(res, debug);
    }
@@ -727,7 +733,7 @@ llama_ubatch llama_batch_allocr::ubatch_add(const std::vector<int32_t> & idxs, u

void llama_batch_allocr::ubatch_print(const llama_ubatch & ubatch, int debug) {
    if (debug > 0) {
        LLAMA_LOG_DEBUG("%s: equal_seqs = %d\n", __func__, ubatch.equal_seqs);
        LLAMA_LOG_DEBUG("%s: equal_seqs = %d\n", __func__, ubatch.equal_seqs());
        LLAMA_LOG_DEBUG("%s: n_tokens = %d\n", __func__, ubatch.n_tokens);
        LLAMA_LOG_DEBUG("%s: n_seq_tokens = %d\n", __func__, ubatch.n_seq_tokens);
        LLAMA_LOG_DEBUG("%s: n_seqs = %d\n", __func__, ubatch.n_seqs);

@@ -8,12 +8,17 @@
#include <vector>
#include <set>
#include <bitset>
#include <memory>
#include <unordered_map>

// keep this struct lightweight
// it points to data in `llama_batch_allocr`
struct llama_ubatch {
    bool equal_seqs;
    bool equal_seqs() const {
        return b_equal_seqs != 0;
    }

    uint32_t b_equal_seqs; // note: this is a boolean, but we use an int32_t for alignment
                           // otherwise address sanitizer complains
    // TODO: whole_seqs for embeddings?

    uint32_t n_tokens; // total tokens (n_seq_tokens * n_seqs)
@@ -34,6 +39,20 @@ struct llama_ubatch {
    llama_seq_id * seq_id_unq; // [n_seqs_unq] | s | seq_id
    int32_t * seq_idx; // [LLAMA_MAX_SEQ] | - | seq_idx
    int8_t * output; // [n_tokens] | i | -

    struct data_t {
        std::vector<llama_token> token;
        std::vector<float> embd;
        std::vector<llama_pos> pos;
        std::vector<int32_t> n_seq_id;
        std::vector<llama_seq_id *> seq_id;
        std::vector<llama_seq_id> seq_id_unq;
        std::vector<int32_t> seq_idx;
        std::vector<int8_t> output;
    };

    // the llama_ubatch pointers above point to this data if set. otherwise - points to non-owning data
    std::shared_ptr<data_t> data;
};

// a helper for sanitizing, fulfilling and splitting a batch
@@ -48,6 +67,7 @@ public:
            const llama_vocab & vocab,
            const llama_memory_i * memory,
            uint32_t n_embd,
            uint32_t n_seq_max,
            bool output_all);

    const llama_batch & get_batch() const;
@@ -100,6 +120,7 @@ private:
    const uint32_t n_pos_per_embd;

    uint32_t n_embd;
    uint32_t n_seq_max;
    uint32_t n_outputs;

    std::array<llama_seq_id, 1> seq_id_0 = { 0 }; // default sequence id
@@ -115,7 +136,7 @@ private:
    using seq_cpl_t = std::vector<bool>;

    // helper flag to quickly determine if there are any coupled sequences in the batch
    bool has_cpl;
    bool has_cpl = false;

    std::vector<pos_set_t> seq_pos; // seq_pos[s]: the set of positions in sequence s
    std::vector<seq_cpl_t> seq_cpl; // seq_cpl[s0][s1]: if sequence s0 is coupled to sequence s1
@@ -135,20 +156,5 @@ private:
    // used[i] indicates if token i has already been used in a previous ubatch
    std::vector<bool> used;

    // llama_ubatch points to this data:
    struct ubatch {
        std::vector<llama_token> token;
        std::vector<float> embd;
        std::vector<llama_pos> pos;
        std::vector<int32_t> n_seq_id;
        std::vector<llama_seq_id *> seq_id;
        std::vector<llama_seq_id> seq_id_unq;
        std::vector<int32_t> seq_idx;
        std::vector<int8_t> output;
    };

    // current splitting state:
    std::vector<ubatch> ubatches;

    int debug;
};

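Aside: the data member added to llama_ubatch above lets a ubatch own its backing buffers through a shared_ptr, so the raw pointers stay valid for as long as any copy of the ubatch is alive; this is what replaces the old std::vector<ubatch> storage inside llama_batch_allocr. Below is a reduced sketch of that ownership pattern with simplified stand-in types, not the real llama.cpp structs.

#include <memory>
#include <vector>

struct ubatch_data {                    // stand-in for llama_ubatch::data_t
    std::vector<int>   token;
    std::vector<float> pos;
};

struct ubatch_view {                    // stand-in for llama_ubatch
    const int   * token;                // non-owning pointer into data->token
    const float * pos;                  // non-owning pointer into data->pos
    std::shared_ptr<ubatch_data> data;  // keeps the storage alive
};

static ubatch_view make_ubatch(int n_tokens) {
    auto data = std::make_shared<ubatch_data>();
    data->token.resize(n_tokens);
    data->pos.resize(n_tokens);
    // copies of the returned view share ownership of the buffers
    return { data->token.data(), data->pos.data(), std::move(data) };
}
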
@@ -56,6 +56,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "glmedge", LLM_CHAT_TEMPLATE_GLMEDGE },
    { "minicpm", LLM_CHAT_TEMPLATE_MINICPM },
    { "exaone3", LLM_CHAT_TEMPLATE_EXAONE_3 },
    { "exaone4", LLM_CHAT_TEMPLATE_EXAONE_4 },
    { "rwkv-world", LLM_CHAT_TEMPLATE_RWKV_WORLD },
    { "granite", LLM_CHAT_TEMPLATE_GRANITE },
    { "gigachat", LLM_CHAT_TEMPLATE_GIGACHAT },
@@ -65,6 +66,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
    { "llama4", LLM_CHAT_TEMPLATE_LLAMA4 },
    { "smolvlm", LLM_CHAT_TEMPLATE_SMOLVLM },
    { "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
    { "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
};

llm_chat_template llm_chat_template_from_str(const std::string & name) {
@@ -167,10 +169,13 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
    } else if (tmpl_contains(LU8("<|Assistant|>")) && tmpl_contains(LU8("<|User|>")) && tmpl_contains(LU8("<|end▁of▁sentence|>"))) {
        return LLM_CHAT_TEMPLATE_DEEPSEEK_3;
    } else if (tmpl_contains("[|system|]") && tmpl_contains("[|assistant|]") && tmpl_contains("[|endofturn|]")) {
        if (tmpl_contains("[|tool|]")) {
            return LLM_CHAT_TEMPLATE_EXAONE_4;
        }
        // ref: https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct/discussions/8#66bae61b1893d14ee8ed85bb
        // EXAONE-3.0-7.8B-Instruct
        return LLM_CHAT_TEMPLATE_EXAONE_3;
    } else if (tmpl_contains("rwkv-world")) {
    } else if (tmpl_contains("rwkv-world") || tmpl_contains("{{- 'User: ' + message['content']|trim + '\\n\\n' -}}")) {
        return LLM_CHAT_TEMPLATE_RWKV_WORLD;
    } else if (tmpl_contains("<|start_of_role|>")) {
        return LLM_CHAT_TEMPLATE_GRANITE;
@@ -188,6 +193,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
        return LLM_CHAT_TEMPLATE_DOTS1;
    } else if (tmpl_contains("<|startoftext|>") && tmpl_contains("<|extra_4|>")) {
        return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
    } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
        return LLM_CHAT_TEMPLATE_KIMI_K2;
    }
    return LLM_CHAT_TEMPLATE_UNKNOWN;
}
@@ -529,6 +536,22 @@ int32_t llm_chat_apply_template(
        if (add_ass) {
            ss << "[|assistant|]";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_EXAONE_4) {
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "[|system|]" << trim(message->content) << "[|endofturn|]\n";
            } else if (role == "user") {
                ss << "[|user|]" << trim(message->content) << "\n";
            } else if (role == "assistant") {
                ss << "[|assistant|]" << trim(message->content) << "[|endofturn|]\n";
            } else if (role == "tool") {
                ss << "[|tool|]" << trim(message->content) << "[|endofturn|]\n";
            }
        }
        if (add_ass) {
            ss << "[|assistant|]";
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_RWKV_WORLD) {
        // this template requires the model to have "\n\n" as EOT token
        for (size_t i = 0; i < chat.size(); i++) {
@@ -680,6 +703,25 @@ int32_t llm_chat_apply_template(
            ss << "<|startoftext|>" << message->content << "<|extra_0|>";
            }
        }
    } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
        // moonshotai/Kimi-K2-Instruct
        for (auto message : chat) {
            std::string role(message->role);
            if (role == "system") {
                ss << "<|im_system|>system<|im_middle|>";
            } else if (role == "user") {
                ss << "<|im_user|>user<|im_middle|>";
            } else if (role == "assistant") {
                ss << "<|im_assistant|>assistant<|im_middle|>";
            } else if (role == "tool") {
                ss << "<|im_system|>tool<|im_middle|>";
            }

            ss << message->content << "<|im_end|>";
        }
        if (add_ass) {
            ss << "<|im_assistant|>assistant<|im_middle|>";
        }
    } else {
        // template not supported
        return -1;

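Worked example: tracing the EXAONE_4 branch added above, a system + user exchange rendered with add_ass = true produces approximately the following prompt text (derived from the code, not from model documentation):

[|system|]You are a helpful assistant.[|endofturn|]
[|user|]Hello
[|assistant|]
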
@@ -35,6 +35,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_GLMEDGE,
    LLM_CHAT_TEMPLATE_MINICPM,
    LLM_CHAT_TEMPLATE_EXAONE_3,
    LLM_CHAT_TEMPLATE_EXAONE_4,
    LLM_CHAT_TEMPLATE_RWKV_WORLD,
    LLM_CHAT_TEMPLATE_GRANITE,
    LLM_CHAT_TEMPLATE_GIGACHAT,
@@ -45,6 +46,7 @@ enum llm_chat_template {
    LLM_CHAT_TEMPLATE_SMOLVLM,
    LLM_CHAT_TEMPLATE_DOTS1,
    LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
    LLM_CHAT_TEMPLATE_KIMI_K2,
    LLM_CHAT_TEMPLATE_UNKNOWN,
};

@@ -98,10 +98,20 @@ llama_context::llama_context(
        LLAMA_LOG_WARN("%s: n_batch is less than GGML_KQ_MASK_PAD - increasing to %d\n", __func__, GGML_KQ_MASK_PAD);
        cparams.n_batch = GGML_KQ_MASK_PAD;
    }

    cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);

    cparams.op_offload = params.op_offload;
    cparams.kv_unified = params.kv_unified;

    {
        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;

        if (!supports_set_rows && !cparams.kv_unified) {
            LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
            cparams.kv_unified = true;
        }
    }

    const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

@@ -112,6 +122,7 @@ llama_context::llama_context(
    LLAMA_LOG_INFO("%s: n_ubatch = %u\n", __func__, cparams.n_ubatch);
    LLAMA_LOG_INFO("%s: causal_attn = %d\n", __func__, cparams.causal_attn);
    LLAMA_LOG_INFO("%s: flash_attn = %d\n", __func__, cparams.flash_attn);
    LLAMA_LOG_INFO("%s: kv_unified = %s\n", __func__, cparams.kv_unified ? "true" : "false");
    LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base);
    LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale);

@@ -227,8 +238,8 @@ llama_context::llama_context(

    LLAMA_LOG_DEBUG("%s: max_nodes = %zu\n", __func__, max_nodes);

    // buffer used to store the computation graph and the tensor meta data
    buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
    gf_res_prev.reset(new llm_graph_result(max_nodes));
    gf_res_reserve.reset(new llm_graph_result(max_nodes));

    // TODO: move these checks to ggml_backend_sched
    // enabling pipeline parallelism in the scheduler increases memory usage, so it is only done when necessary
@@ -267,7 +278,7 @@ llama_context::llama_context(

    // reserve worst-case graph
    if (!hparams.vocab_only && memory) {
        const uint32_t n_seqs = cparams.n_seq_max;
        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

        LLAMA_LOG_DEBUG("%s: worst-case: n_tokens = %d, n_seqs = %d, n_outputs = %d\n", __func__, n_tokens, n_seqs, n_outputs);
@@ -287,7 +298,7 @@ llama_context::llama_context(

        cross.v_embd.clear();

        // reserve pp graph first so that buffers are only allocated once
        // reserve pp (prompt processing) graph first so that buffers are only allocated once
        {
            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
            if (!gf) {
@@ -298,9 +309,9 @@ llama_context::llama_context(
            n_nodes_pp = ggml_graph_n_nodes(gf);
        }

        // reserve with tg graph to get the number of splits and nodes
        // reserve with tg (token generation) graph to get the number of splits and nodes
        {
            auto * gf = graph_reserve(1, 1, 1, mctx.get());
            auto * gf = graph_reserve(n_seqs, n_seqs, n_seqs, mctx.get());
            if (!gf) {
                throw std::runtime_error("failed to allocate compute tg buffers");
            }
@@ -311,6 +322,10 @@ llama_context::llama_context(

        // reserve again with pp graph to avoid ggml-alloc reallocations during inference
        {
            // TODO: not sure if the following graph would be worster case for multi-stream KV caches:
            //
            // auto * gf = graph_reserve(n_tokens, 1, n_tokens, mctx.get());
            //
            auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
            if (!gf) {
                throw std::runtime_error("failed to allocate compute pp buffers");
@@ -388,10 +403,6 @@ ggml_backend_sched_t llama_context::get_sched() const {
    return sched.get();
}

ggml_context * llama_context::get_ctx_compute() const {
    return ctx_compute.get();
}

uint32_t llama_context::n_ctx() const {
    return cparams.n_ctx;
}
@@ -463,6 +474,11 @@ bool llama_context::kv_self_update(bool optimize) {
        }
    }

    // reset the previous graph result to make sure that it won't be reused
    // TODO: change the mctx->apply() to return information if a graph reserve is needed
    //       reset the graph result only if the memory module did reset the scheduler
    gf_res_prev->reset();

    if (!mctx->apply()) {
        LLAMA_LOG_ERROR("%s: failed to apply memory update\n", __func__);
    }
@@ -475,7 +491,7 @@ bool llama_context::kv_self_update(bool optimize) {
            throw std::runtime_error("failed to initialize memory context");
        }

        const uint32_t n_seqs = cparams.n_seq_max;
        const uint32_t n_seqs = cparams.kv_unified ? 1 : cparams.n_seq_max;
        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);

        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, mctx.get());
@@ -492,12 +508,16 @@ enum llama_pooling_type llama_context::pooling_type() const {
}

float * llama_context::get_logits() {
    output_reorder();

    return logits;
}

float * llama_context::get_logits_ith(int32_t i) {
    int64_t j = -1;

    output_reorder();

    try {
        if (logits == nullptr) {
            throw std::runtime_error("no logits");
@@ -534,12 +554,16 @@ float * llama_context::get_logits_ith(int32_t i) {
}

float * llama_context::get_embeddings() {
    output_reorder();

    return embd;
}

float * llama_context::get_embeddings_ith(int32_t i) {
    int64_t j = -1;

    output_reorder();

    try {
        if (embd == nullptr) {
            throw std::runtime_error("no embeddings");
@@ -678,38 +702,59 @@ bool llama_context::apply_adapter_cvec(
    return cvec.apply(model, data, len, n_embd, il_start, il_end);
}

llm_graph_result_ptr llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, llm_graph_type gtype, llama_memory_context_i * mctx, ggml_status & ret) {
    if (mctx && !mctx->apply()) {
        LLAMA_LOG_ERROR("%s: failed to apply memory context\n", __func__);
        ret = GGML_STATUS_FAILED;
        return nullptr;
    }

    auto * gf = graph_init();
    auto * res = gf_res_prev.get();
    auto * gf = res->get_gf();

    // the new graph parameters
    // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
    const auto gparams = graph_params(res, ubatch, mctx, gtype);

    if (res->can_reuse(gparams)) {
        //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);

        n_reused++;
    } else {
        res->reset();

        ggml_backend_sched_reset(sched.get());
        ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

        //const auto t_start_us = ggml_time_us();

        gf = model.build_graph(gparams);

        //LLAMA_LOG_INFO("graph build time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);

        if (!gf) {
            LLAMA_LOG_ERROR("%s: failed to initialize graph\n", __func__);
            ret = GGML_STATUS_FAILED;
            return nullptr;
        }

    auto res = graph_build(ctx_compute.get(), gf, ubatch, gtype, mctx);
    if (!res) {
        LLAMA_LOG_ERROR("%s: failed to build graph\n", __func__);
        ret = GGML_STATUS_FAILED;
        return nullptr;
    }

    // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

        if (!ggml_backend_sched_alloc_graph(sched.get(), gf)) {
            LLAMA_LOG_ERROR("%s: failed to allocate graph\n", __func__);
            ret = GGML_STATUS_ALLOC_FAILED;
            return nullptr;
        }
    }

    // set the input data for the input tensors
    {
        //const auto t_start_us = ggml_time_us();

        res->set_inputs(&ubatch);

        const auto status = graph_compute(gf, ubatch.n_tokens > 1);
        //LLAMA_LOG_INFO("graph set inputs time: %.3f ms\n", (ggml_time_us() - t_start_us)/1000.0);
    }

    const auto status = graph_compute(res->get_gf(), ubatch.n_tokens > 1);
    if (status != GGML_STATUS_SUCCESS) {
        LLAMA_LOG_ERROR("%s: failed to compute graph, compute status: %d\n", __func__, status);
        ret = status;
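Aside: the reworked process_ubatch above keeps the previous llm_graph_result around and only rebuilds and re-allocates the graph when can_reuse() rejects the new parameters. Below is a stripped-down, self-contained sketch of that reuse-or-rebuild decision with stand-in types; the real code additionally resets the scheduler and refreshes the inputs through the result object.

#include <cstdio>

struct graph_params {                   // stand-in for llm_graph_params
    int n_tokens = 0;
    int n_kv     = 0;
};

struct graph_result {                   // stand-in for llm_graph_result
    graph_params params;
    bool built = false;

    // reuse only if the new parameters describe the same graph topology
    bool can_reuse(const graph_params & p) const {
        return built && p.n_tokens == params.n_tokens && p.n_kv == params.n_kv;
    }

    void rebuild(const graph_params & p) {
        params = p;
        built  = true;
        std::printf("rebuilt graph: n_tokens=%d n_kv=%d\n", p.n_tokens, p.n_kv);
    }
};

static void process(graph_result & prev, const graph_params & p, int & n_reused) {
    if (prev.can_reuse(p)) {
        n_reused++;                     // cheap path: keep topology, only refresh inputs
    } else {
        prev.rebuild(p);                // expensive path: rebuild + re-allocate
    }
}
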
@@ -732,15 +777,18 @@ int llama_context::encode(const llama_batch & batch_inp) {
    const auto & hparams = model.hparams;

    const int64_t n_embd = hparams.n_embd;
    const int32_t n_vocab = model.vocab.n_tokens();

    // note: during encode, we always pass the full sequence starting from pos = 0
    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, true)) {
    if (!balloc->init(batch_inp, model.vocab, nullptr, n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
        return -1;
    }

    const uint32_t n_tokens = balloc->get_n_tokens();

    // [TAG_NO_CACHE_PAD]
    // TODO: add new split mode where we pad the input sequences so that ubatch.equal_seqs == true
    const llama_ubatch ubatch = balloc->split_simple(n_tokens);

    // micro-batching is not possible for non-causal encoding, so we process the batch in a single shot
@@ -767,9 +815,6 @@ int llama_context::encode(const llama_batch & batch_inp) {

    n_outputs = n_tokens;

    ggml_backend_sched_reset(sched.get());
    ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

    const auto causal_attn_org = cparams.causal_attn;

    // always use non-causal attention for encoder graphs
@@ -778,7 +823,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
    cparams.causal_attn = false;

    ggml_status status;
    const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);
    const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_ENCODER, nullptr, status);

    cparams.causal_attn = causal_attn_org;

@@ -791,10 +836,20 @@ int llama_context::encode(const llama_batch & batch_inp) {
        }
    }

    auto * t_logits = res->get_logits();
    auto * t_embd = res->get_embd_pooled() ? res->get_embd_pooled() : res->get_embd();

    // extract logits
    if (logits && t_logits) {
        ggml_backend_t backend_res = ggml_backend_sched_get_tensor_backend(sched.get(), t_logits);
        GGML_ASSERT(backend_res != nullptr);
        GGML_ASSERT(logits != nullptr);

        ggml_backend_tensor_get_async(backend_res, t_logits, logits, 0, n_tokens*n_vocab*sizeof(float));
    }

    // extract embeddings
    if (t_embd) {
    if (embd && t_embd) {
        ggml_backend_t backend_embd = ggml_backend_sched_get_tensor_backend(sched.get(), t_embd);
        GGML_ASSERT(backend_embd != nullptr);

@@ -844,9 +899,11 @@ int llama_context::encode(const llama_batch & batch_inp) {
        }
    }

    if (!supports_set_rows) {
        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
        // overlap with device computation.
        ggml_backend_sched_reset(sched.get());
    }

    // TODO: hacky solution
    if (model.arch == LLM_ARCH_T5 && t_embd) {
@@ -899,7 +956,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
    // when computing embeddings, all tokens are output
    const bool output_all = cparams.embeddings;

    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, output_all)) {
    if (!balloc->init(batch_inp, vocab, memory.get(), n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, output_all)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
        return -1;
    }
@@ -927,6 +984,7 @@ int llama_context::decode(const llama_batch & batch_inp) {

    // TODO: this clear of the buffer can easily be forgotten - need something better
    embd_seq.clear();
    output_swaps.clear();

    bool did_optimize = false;

@@ -1005,11 +1063,8 @@ int llama_context::decode(const llama_batch & batch_inp) {
            n_outputs = n_outputs_new;
        }

        ggml_backend_sched_reset(sched.get());
        ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

        ggml_status status;
        const auto res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);
        const auto * res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mctx.get(), status);

        if (!res) {
            // the last ubatch failed or was aborted -> remove all positions of that ubatch from the KV cache
@@ -1149,9 +1204,6 @@ int llama_context::decode(const llama_batch & batch_inp) {
    // make the outputs have the same order they had in the user-provided batch
    // note: this is mostly relevant for recurrent models atm
    if (!sorted_output) {
        const uint32_t n_vocab = model.vocab.n_tokens();
        const uint64_t n_embd = model.hparams.n_embd;

        GGML_ASSERT((size_t) n_outputs == out_ids.size());

        // TODO: is there something more efficient which also minimizes swaps?
@@ -1167,16 +1219,9 @@ int llama_context::decode(const llama_batch & batch_inp) {
                continue;
            }
            std::swap(out_ids[i], out_ids[j_min]);
            if (logits_size > 0) {
                for (uint32_t k = 0; k < n_vocab; k++) {
                    std::swap(logits[i*n_vocab + k], logits[j_min*n_vocab + k]);
                }
            }
            if (embd_size > 0) {
                for (uint32_t k = 0; k < n_embd; k++) {
                    std::swap(embd[i*n_embd + k], embd[j_min*n_embd + k]);
                }
            }

            // remember the swaps and apply them lazily upon logits/embeddings access
            output_swaps.push_back({ i, j_min });
        }

        std::fill(output_ids.begin(), output_ids.end(), -1);
@@ -1190,9 +1235,11 @@ int llama_context::decode(const llama_batch & batch_inp) {
    // wait for the computation to finish (automatically done when obtaining the model output)
    //synchronize();

    if (!supports_set_rows) {
        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
        // overlap with device computation.
        ggml_backend_sched_reset(sched.get());
    }

    return 0;
}
@@ -1271,24 +1318,40 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
    return n_outputs_max;
}

void llama_context::output_reorder() {
    const uint32_t n_vocab = model.vocab.n_tokens();
    const uint64_t n_embd = model.hparams.n_embd;

    for (uint32_t s = 0; s < output_swaps.size(); ++s) {
        const uint32_t i0 = output_swaps[s].i0;
        const uint32_t i1 = output_swaps[s].i1;

        if (logits_size > 0) {
            for (uint32_t k = 0; k < n_vocab; k++) {
                std::swap(logits[i0*n_vocab + k], logits[i1*n_vocab + k]);
            }
        }

        if (embd_size > 0) {
            for (uint32_t k = 0; k < n_embd; k++) {
                std::swap(embd[i0*n_embd + k], embd[i1*n_embd + k]);
            }
        }
    }

    output_swaps.clear();
}

//
// graph
//

int32_t llama_context::graph_max_nodes() const {
    return std::max<int32_t>(65536, 5*model.n_tensors());
uint32_t llama_context::graph_max_nodes() const {
    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
}

ggml_cgraph * llama_context::graph_init() {
    ggml_init_params params = {
        /*.mem_size =*/ buf_compute_meta.size(),
        /*.mem_buffer =*/ buf_compute_meta.data(),
        /*.no_alloc =*/ true,
    };

    ctx_compute.reset(ggml_init(params));

    return ggml_new_graph_custom(ctx_compute.get(), graph_max_nodes(), false);
llm_graph_result * llama_context::get_gf_res_reserve() const {
    return static_cast<llm_graph_result *>(gf_res_reserve.get());
}

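Aside: the swaps recorded during decode are exactly the swaps of a selection sort over out_ids, and output_reorder above replays them over the logits/embeddings buffers the first time they are read. Below is a small self-contained sketch of the record-then-replay idea with simplified types, not the actual llama_context members.

#include <cstdint>
#include <utility>
#include <vector>

// record the swaps needed to sort `order`, without touching the payload yet
static std::vector<std::pair<uint32_t, uint32_t>> plan_swaps(std::vector<int32_t> order) {
    std::vector<std::pair<uint32_t, uint32_t>> swaps;
    for (uint32_t i = 0; i < order.size(); ++i) {
        uint32_t j_min = i;
        for (uint32_t j = i + 1; j < order.size(); ++j) {
            if (order[j] < order[j_min]) {
                j_min = j;
            }
        }
        if (j_min != i) {
            std::swap(order[i], order[j_min]);
            swaps.push_back({i, j_min});   // deferred: applied to the payload later
        }
    }
    return swaps;
}

// later (e.g. on first logits access), replay the recorded swaps over the rows
static void apply_swaps(std::vector<float> & rows, uint32_t row_size,
                        const std::vector<std::pair<uint32_t, uint32_t>> & swaps) {
    for (const auto & s : swaps) {
        for (uint32_t k = 0; k < row_size; ++k) {
            std::swap(rows[s.first*row_size + k], rows[s.second*row_size + k]);
        }
    }
}
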
ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx) {
@@ -1301,6 +1364,11 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
        LLAMA_LOG_DEBUG("%s: making n_tokens a multiple of n_seqs - n_tokens = %u, n_seqs = %u, n_outputs = %u\n", __func__, n_tokens, n_seqs, n_outputs);
    }

    ggml_backend_sched_reset(sched.get());

    // when the scheduler is reset, we cannnot reuse the old graph, so we reset the previous graph result to prevent that
    gf_res_prev->reset();

    // store the n_outputs as it is, and restore it afterwards
    // TODO: not sure if needed, might simplify in the future by removing this
    const auto save_n_outputs = this->n_outputs;
@@ -1310,18 +1378,16 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
    llama_batch_allocr balloc(model.hparams.n_pos_per_embd());
    llama_ubatch ubatch = balloc.ubatch_reserve(n_tokens/n_seqs, n_seqs);

    auto * gf = graph_init();
    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx);
    auto * res = gf_res_reserve.get();

    const auto gparams = graph_params(res, ubatch, mctx, LLM_GRAPH_TYPE_DEFAULT);

    res->reset();

    auto * gf = model.build_graph(gparams);

    this->n_outputs = save_n_outputs;

    if (!res) {
        LLAMA_LOG_ERROR("%s: failed to build worst-case graph\n", __func__);
        return nullptr;
    }

    ggml_backend_sched_reset(sched.get());

    // initialize scheduler with the specified graph
    if (!ggml_backend_sched_reserve(sched.get(), gf)) {
        LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
@@ -1331,19 +1397,17 @@ ggml_cgraph * llama_context::graph_reserve(uint32_t n_tokens, uint32_t n_seqs, u
    return gf;
}

llm_graph_result_ptr llama_context::graph_build(
        ggml_context * ctx,
        ggml_cgraph * gf,
llm_graph_params llama_context::graph_params(
        llm_graph_result * res,
        const llama_ubatch & ubatch,
        llm_graph_type gtype,
        const llama_memory_context_i * mctx) {
    return model.build_graph(
        {
            /*.ctx =*/ ctx,
        const llama_memory_context_i * mctx,
        llm_graph_type gtype) const {
    return {
        /*.arch =*/ model.arch,
        /*.hparams =*/ model.hparams,
        /*.cparams =*/ cparams,
        /*.ubatch =*/ ubatch,
        /*.gtype =*/ gtype,
        /*.sched =*/ sched.get(),
        /*.backend_cpu =*/ backend_cpu,
        /*.cvec =*/ &cvec,
@@ -1352,7 +1416,8 @@ llm_graph_result_ptr llama_context::graph_build(
        /*.cross =*/ &cross,
        /*.n_outputs =*/ n_outputs,
        /*.cb =*/ graph_get_cb(),
    }, gf, gtype);
        /*.res =*/ res,
    };
}

ggml_status llama_context::graph_compute(
@@ -1930,6 +1995,7 @@ llama_perf_context_data llama_context::perf_get_data() const {
    data.t_eval_ms = 1e-3 * t_eval_us;
    data.n_p_eval = std::max(1, n_p_eval);
    data.n_eval = std::max(1, n_eval);
    data.n_reused = std::max(0, n_reused);

    return data;
}
@@ -1938,6 +2004,7 @@ void llama_context::perf_reset() {
    t_start_us = ggml_time_us();
    t_eval_us = n_eval = 0;
    t_p_eval_us = n_p_eval = 0;
    n_reused = 0;
}

//
@@ -2028,7 +2095,7 @@ void llama_context::opt_epoch_iter(
            batch.logits [pos_batch] = true;
        }

        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, true)) {
        if (!balloc->init(batch, model.vocab, nullptr, model.hparams.n_embd, cparams.kv_unified ? LLAMA_MAX_SEQ : cparams.n_seq_max, true)) {
            LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
            return;
        }
@@ -2064,8 +2131,13 @@ void llama_context::opt_epoch_iter(
                break;
            }

            auto * gf = graph_init();
            auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_DEFAULT, mctx.get());
            auto * res = gf_res_prev.get();

            const auto gparams = graph_params(res, ubatch, mctx.get(), LLM_GRAPH_TYPE_DEFAULT);

            res->reset();

            auto * gf = model.build_graph(gparams);

            struct ggml_context * ctx_compute_opt;
            {
@@ -2187,6 +2259,7 @@ llama_context_params llama_context_default_params() {
        /*.no_perf =*/ true,
        /*.op_offload =*/ true,
        /*.swa_full =*/ true,
        /*.kv_unified =*/ false,
    };

    return result;
@@ -2807,6 +2880,7 @@ void llama_perf_context_print(const llama_context * ctx) {
    LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
    LLAMA_LOG_INFO("%s: graphs reused = %10d\n", __func__, data.n_reused);
}

void llama_perf_context_reset(llama_context * ctx) {

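Aside: with the new n_reused counter, the perf report now includes a "graphs reused" line. A minimal usage sketch, assuming a valid llama_context that has already processed a few batches:

#include "llama.h"

static void print_and_reset_perf(llama_context * ctx) {
    // alongside the existing load/prompt/eval timing lines, this now also
    // prints "graphs reused = N" (the log line added in the diff above)
    llama_perf_context_print(ctx);
    llama_perf_context_reset(ctx);
}
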
@@ -35,8 +35,6 @@ struct llama_context {

    ggml_backend_sched_t get_sched() const;

    ggml_context * get_ctx_compute() const;

    uint32_t n_ctx() const;
    uint32_t n_ctx_per_seq() const;
    uint32_t n_batch() const;
@@ -96,7 +94,7 @@ struct llama_context {
    // if memory_context is provided, it will be applied first to the context's memory
    // ret contains the status of the graph computation
    // returns nullptr only if ret != GGML_STATUS_SUCCESS
    llm_graph_result_ptr process_ubatch(
    llm_graph_result * process_ubatch(
            const llama_ubatch & ubatch,
            llm_graph_type gtype,
            llama_memory_context_i * mctx,
@@ -183,15 +181,17 @@ private:
    // Returns max number of outputs for which space was reserved.
    uint32_t output_reserve(int32_t n_outputs);

    void output_reorder();

    //
    // graph
    //

public:
    int32_t graph_max_nodes() const;
    uint32_t graph_max_nodes() const;

    // zero-out inputs and create the ctx_compute for the compute graph
    ggml_cgraph * graph_init();
    // can reuse the llm_graph_result instance of the context (for example to update a memory module)
    llm_graph_result * get_gf_res_reserve() const;

    // returns the result of ggml_backend_sched_graph_compute_async execution
    ggml_status graph_compute(ggml_cgraph * gf, bool batched);
@@ -200,12 +200,11 @@ public:
    ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_context_i * mctx);

private:
    llm_graph_result_ptr graph_build(
            ggml_context * ctx,
            ggml_cgraph * gf,
    llm_graph_params graph_params(
            llm_graph_result * res,
            const llama_ubatch & ubatch,
            llm_graph_type gtype,
            const llama_memory_context_i * mctx);
            const llama_memory_context_i * mctx,
            llm_graph_type gtype) const;

    llm_graph_cb graph_get_cb() const;

@@ -253,13 +252,18 @@ private:

    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers

    struct swap_info {
        uint32_t i0;
        uint32_t i1;
    };

    std::vector<swap_info> output_swaps;

    ggml_backend_sched_ptr sched;

    ggml_backend_t backend_cpu = nullptr;
    std::vector<ggml_backend_ptr> backends;

    ggml_context_ptr ctx_compute;

    // training
    ggml_opt_context_t opt_ctx = nullptr;

@@ -275,14 +279,18 @@ private:
    std::vector<ggml_backend_t> backend_ptrs;
    std::vector<ggml_backend_buffer_type_t> backend_buft;

    // memory buffers used to evaluate the model
    std::vector<uint8_t> buf_compute_meta;
    llm_graph_result_ptr gf_res_prev;
    llm_graph_result_ptr gf_res_reserve;

    // host buffer for the model output (logits and embeddings)
    ggml_backend_buffer_ptr buf_output;

    bool has_evaluated_once = false;

    // env: LLAMA_SET_ROWS (temporary)
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
    bool supports_set_rows = false;

    // perf
    mutable int64_t t_start_us = 0;
    mutable int64_t t_load_us = 0;
@@ -294,4 +302,6 @@ private:

    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    mutable int32_t n_eval = 0; // number of eval calls

    mutable int32_t n_reused = 0; // number of times the previous graph was reused
};

@@ -11,8 +11,8 @@ struct llama_cparams {
    uint32_t n_batch;
    uint32_t n_ubatch;
    uint32_t n_seq_max;
    int n_threads; // number of threads to use for generation
    int n_threads_batch; // number of threads to use for batch processing
    int32_t n_threads; // number of threads to use for generation
    int32_t n_threads_batch; // number of threads to use for batch processing

    float rope_freq_base;
    float rope_freq_scale;
@@ -33,6 +33,7 @@ struct llama_cparams {
    bool no_perf;
    bool warmup;
    bool op_offload;
    bool kv_unified;

    enum llama_pooling_type pooling_type;

@@ -28,6 +28,15 @@ void llm_graph_input_embd::set_input(const llama_ubatch * ubatch) {
    }
}

bool llm_graph_input_embd::can_reuse(const llm_graph_params & params) {
    bool res = true;

    res &= (!tokens && !params.ubatch.token) || (tokens && tokens->ne[0] == params.ubatch.n_tokens);
    res &= (!embd && !params.ubatch.embd) || (embd && embd->ne[0] == params.ubatch.n_tokens);

    return res;
}

void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
    if (ubatch->pos && pos) {
        const int64_t n_tokens = ubatch->n_tokens;
@@ -50,6 +59,14 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
    }
}

bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
    bool res = true;

    res &= pos->ne[0] == params.ubatch.n_tokens;

    return res;
}

void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
    if (ubatch->pos && attn_scale) {
        const int64_t n_tokens = ubatch->n_tokens;
@@ -71,7 +88,7 @@ void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
    const int64_t n_tokens = ubatch->n_tokens;

    GGML_ASSERT(ggml_backend_buffer_is_host(pos_bucket->buffer));
    GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
    GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing

    int32_t * data = (int32_t *) pos_bucket->data;

@@ -118,6 +135,14 @@ void llm_graph_input_out_ids::set_input(const llama_ubatch * ubatch) {
    }
}

bool llm_graph_input_out_ids::can_reuse(const llm_graph_params & params) {
    bool res = true;

    res &= n_outputs == params.n_outputs;

    return res;
}

void llm_graph_input_mean::set_input(const llama_ubatch * ubatch) {
    if (cparams.embeddings && cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
        const int64_t n_tokens = ubatch->n_tokens;
@@ -287,6 +312,24 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
    mctx->set_input_kq_mask(self_kq_mask, ubatch, cparams.causal_attn);
}

bool llm_graph_input_attn_kv_unified::can_reuse(const llm_graph_params & params) {
    const auto * mctx = static_cast<const llama_kv_cache_unified_context *>(params.mctx);

    this->mctx = mctx;

    bool res = true;

    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
    //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

    res &= self_kq_mask->ne[0] == mctx->get_n_kv();
    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

    res &= mctx->get_supports_set_rows(); // TODO: tmp

    return res;
}

void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch) {
    mctx->get_base()->set_input_k_idxs(self_k_idxs, ubatch);
    mctx->get_base()->set_input_v_idxs(self_v_idxs, ubatch);
@@ -299,6 +342,30 @@ void llm_graph_input_attn_kv_unified_iswa::set_input(const llama_ubatch * ubatch
    mctx->get_swa()->set_input_kq_mask(self_kq_mask_swa, ubatch, cparams.causal_attn);
}

bool llm_graph_input_attn_kv_unified_iswa::can_reuse(const llm_graph_params & params) {
    const auto * mctx = static_cast<const llama_kv_cache_unified_iswa_context *>(params.mctx);

    this->mctx = mctx;

    bool res = true;

    res &= self_k_idxs->ne[0] == params.ubatch.n_tokens;
    //res &= self_v_idxs->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

    res &= self_k_idxs_swa->ne[0] == params.ubatch.n_tokens;
    //res &= self_v_idxs_swa->ne[0] == params.ubatch.n_tokens; // TODO: need to move this to the unified cache and check there

    res &= self_kq_mask->ne[0] == mctx->get_base()->get_n_kv();
    res &= self_kq_mask->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

    res &= self_kq_mask_swa->ne[0] == mctx->get_swa()->get_n_kv();
    res &= self_kq_mask_swa->ne[1] == GGML_PAD(params.ubatch.n_tokens, GGML_KQ_MASK_PAD);

    res &= mctx->get_base()->get_supports_set_rows(); // TODO: tmp

    return res;
}

void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
    GGML_ASSERT(cross_kq_mask);

@@ -306,7 +373,7 @@ void llm_graph_input_attn_cross::set_input(const llama_ubatch * ubatch) {
    const int64_t n_tokens = ubatch->n_tokens;

    GGML_ASSERT(ggml_backend_buffer_is_host(cross_kq_mask->buffer));
    GGML_ASSERT(!ubatch->equal_seqs); // TODO: use ubatch->n_seqs instead of failing
    GGML_ASSERT(!ubatch->equal_seqs()); // TODO: use ubatch->n_seqs instead of failing

    float * data = (float *) cross_kq_mask->data;

@@ -340,6 +407,91 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
    inp_rs->set_input(ubatch);
}

//
|
||||
// llm_graph_result
|
||||
//
|
||||
|
||||
llm_graph_result::llm_graph_result(int64_t max_nodes) : max_nodes(max_nodes) {
|
||||
reset();
|
||||
|
||||
const char * LLAMA_GRAPH_RESULT_DEBUG = getenv("LLAMA_GRAPH_RESULT_DEBUG");
|
||||
debug = LLAMA_GRAPH_RESULT_DEBUG ? atoi(LLAMA_GRAPH_RESULT_DEBUG) : 0;
|
||||
}
|
||||
|
||||
int64_t llm_graph_result::get_max_nodes() const {
|
||||
return max_nodes;
|
||||
}
|
||||
|
||||
void llm_graph_result::reset() {
|
||||
t_tokens = nullptr;
|
||||
t_logits = nullptr;
|
||||
t_embd = nullptr;
|
||||
t_embd_pooled = nullptr;
|
||||
|
||||
params = {};
|
||||
|
||||
inputs.clear();
|
||||
|
||||
buf_compute_meta.resize(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));
|
||||
|
||||
ggml_init_params params = {
|
||||
/*.mem_size =*/ buf_compute_meta.size(),
|
||||
/*.mem_buffer =*/ buf_compute_meta.data(),
|
||||
/*.no_alloc =*/ true,
|
||||
};
|
||||
|
||||
ctx_compute.reset(ggml_init(params));
|
||||
|
||||
gf = ggml_new_graph_custom(ctx_compute.get(), max_nodes, false);
|
||||
}
|
||||
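Note: reset() above sizes a metadata-only buffer and builds a no_alloc ggml context plus an empty graph from it. A minimal standalone sketch of that pattern (arbitrary node budget; assumes only the public ggml.h API is available):

```cpp
#include "ggml.h"

#include <cstdio>
#include <vector>

int main() {
    const size_t max_nodes = 1024; // arbitrary budget for this sketch

    // enough room for the tensor metadata and the graph structure, but no tensor data
    std::vector<uint8_t> buf(ggml_tensor_overhead()*max_nodes + ggml_graph_overhead_custom(max_nodes, false));

    ggml_init_params params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true, // tensors are only placeholders; their data lives elsewhere
    };

    ggml_context * ctx = ggml_init(params);
    ggml_cgraph  * gf  = ggml_new_graph_custom(ctx, max_nodes, false);

    std::printf("graph capacity: %d nodes\n", (int) ggml_graph_size(gf));

    ggml_free(ctx);
    return 0;
}
```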
|
||||
void llm_graph_result::set_inputs(const llama_ubatch * ubatch) {
|
||||
for (auto & input : inputs) {
|
||||
input->set_input(ubatch);
|
||||
}
|
||||
}
|
||||
|
||||
bool llm_graph_result::can_reuse(const llm_graph_params & params) {
|
||||
if (!this->params.allow_reuse(params)) {
|
||||
if (debug > 1) {
|
||||
LLAMA_LOG_DEBUG("%s: cannot reuse graph due to incompatible graph parameters\n", __func__);
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
if (debug > 1) {
|
||||
LLAMA_LOG_DEBUG("%s: checking compatibility of %d inputs:\n", __func__, (int) inputs.size());
|
||||
}
|
||||
|
||||
bool res = true;
|
||||
|
||||
for (auto & input : inputs) {
|
||||
const bool cur = input->can_reuse(params);
|
||||
|
||||
if (debug > 1) {
|
||||
LLAMA_LOG_DEBUG("%s: can_reuse = %d\n", "placeholder", cur);
|
||||
}
|
||||
|
||||
res = res && cur;
|
||||
}
|
||||
|
||||
if (debug > 0) {
|
||||
LLAMA_LOG_DEBUG("%s: can reuse graph = %d\n", __func__, res);
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
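The reuse path is driven by the caller: it keeps the previous result object, calls can_reuse() with the new parameters, and only rebuilds the graph when that fails. A self-contained toy of the same keep-a-snapshot-and-compare pattern (made-up names, not the llama.cpp API):

```cpp
#include <cstdio>
#include <memory>

// toy parameter snapshot: two fields stand in for the ubatch/cparams checks above
struct toy_params {
    int n_tokens  = 0;
    int n_outputs = 0;

    bool allow_reuse(const toy_params & other) const {
        return n_tokens == other.n_tokens && n_outputs == other.n_outputs;
    }
};

struct toy_result {
    toy_params params; // snapshot of the parameters the "graph" was built with

    bool can_reuse(const toy_params & p) const {
        return params.allow_reuse(p);
    }
};

int main() {
    auto res = std::make_unique<toy_result>();
    res->params = {512, 32};

    const toy_params next = {512, 32};

    if (res->can_reuse(next)) {
        std::printf("same topology - reuse the previous graph, only refresh its inputs\n");
    } else {
        std::printf("topology changed - rebuild the graph\n");
        res->params = next;
    }

    return 0;
}
```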
|
||||
llm_graph_input_i * llm_graph_result::add_input(llm_graph_input_ptr input) {
|
||||
inputs.emplace_back(std::move(input));
|
||||
return inputs.back().get();
|
||||
}
|
||||
|
||||
void llm_graph_result::set_params(const llm_graph_params & params) {
|
||||
this->params = params;
|
||||
}
|
||||
|
||||
//
|
||||
// llm_graph_context
|
||||
//
|
||||
@ -374,7 +526,6 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
n_ctx_orig (cparams.n_ctx_orig_yarn),
|
||||
pooling_type (cparams.pooling_type),
|
||||
rope_type (hparams.rope_type),
|
||||
ctx0 (params.ctx),
|
||||
sched (params.sched),
|
||||
backend_cpu (params.backend_cpu),
|
||||
cvec (params.cvec),
|
||||
@ -382,7 +533,10 @@ llm_graph_context::llm_graph_context(const llm_graph_params & params) :
|
||||
mctx (params.mctx),
|
||||
cross (params.cross),
|
||||
cb_func (params.cb),
|
||||
res (std::make_unique<llm_graph_result>()) {
|
||||
res (params.res),
|
||||
ctx0 (res->get_ctx()),
|
||||
gf (res->get_gf()) {
|
||||
res->set_params(params);
|
||||
}
|
||||
|
||||
void llm_graph_context::cb(ggml_tensor * cur, const char * name, int il) const {
|
||||
@ -753,20 +907,28 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
}
|
||||
|
||||
ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
|
||||
|
||||
assert(n_expert_used > 0);
|
||||
|
||||
// order the views before the adds
|
||||
for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
|
||||
cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
|
||||
|
||||
ggml_build_forward_expand(gf, cur_experts[i]);
|
||||
}
|
||||
|
||||
// aggregate experts
|
||||
ggml_tensor * moe_out = nullptr;
|
||||
for (int i = 0; i < n_expert_used; ++i) {
|
||||
ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
|
||||
experts->nb[2], i*experts->nb[1]);
|
||||
// note: here we explicitly use hparams.n_expert_used instead of n_expert_used
|
||||
// to avoid potentially creating a large number of add nodes during warmup
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14753
|
||||
ggml_tensor * moe_out = cur_experts[0];
|
||||
|
||||
if (i == 0) {
|
||||
moe_out = cur_expert;
|
||||
} else {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
||||
}
|
||||
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
|
||||
}
|
||||
|
||||
if (n_expert_used == 1) {
|
||||
if (hparams.n_expert_used == 1) {
|
||||
// avoid returning a non-contiguous tensor
|
||||
moe_out = ggml_cont(ctx0, moe_out);
|
||||
}
|
||||
@ -972,7 +1134,6 @@ ggml_tensor * llm_graph_context::build_pos_bias(ggml_tensor * pos_bucket, ggml_t
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * q,
|
||||
ggml_tensor * k,
|
||||
ggml_tensor * v,
|
||||
@ -982,12 +1143,15 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
float kq_scale) const {
|
||||
const bool v_trans = v->nb[1] > v->nb[2];
|
||||
|
||||
// split the batch into streams if needed
|
||||
const auto n_stream = k->ne[3];
|
||||
|
||||
q = ggml_reshape_4d(ctx0, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);
|
||||
|
||||
q = ggml_permute(ctx0, q, 0, 2, 1, 3);
|
||||
k = ggml_permute(ctx0, k, 0, 2, 1, 3);
|
||||
v = ggml_permute(ctx0, v, 0, 2, 1, 3);
|
||||
|
||||
const auto n_tokens = q->ne[1];
|
||||
const auto n_head = q->ne[2];
|
||||
const auto n_kv = k->ne[1];
|
||||
|
||||
ggml_tensor * cur;
|
||||
@ -1030,7 +1194,7 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
#endif
|
||||
}
|
||||
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
|
||||
cur = ggml_reshape_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
|
||||
} else {
|
||||
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
|
||||
|
||||
@ -1075,7 +1239,8 @@ ggml_tensor * llm_graph_context::build_attn_mha(
|
||||
|
||||
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
|
||||
|
||||
cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
|
||||
// recombine streams
|
||||
cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*cur->ne[1], cur->ne[2]*cur->ne[3]);
|
||||
|
||||
if (!cparams.offload_kqv) {
|
||||
// all nodes between the KV store and the attention output are run on the CPU
|
||||
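The stream split above only touches the token dimension: a batch of n_tokens is viewed as n_stream groups of n_tokens/n_stream before the permutes. A small ggml sketch with assumed sizes (head dim 128, 32 heads, 8 tokens, 2 streams) just to show the resulting shape:

```cpp
#include "ggml.h"

#include <cstdio>
#include <vector>

int main() {
    std::vector<uint8_t> buf(ggml_tensor_overhead()*16 + 1024);

    ggml_init_params params = {
        /*.mem_size   =*/ buf.size(),
        /*.mem_buffer =*/ buf.data(),
        /*.no_alloc   =*/ true,
    };

    ggml_context * ctx = ggml_init(params);

    // assumed sizes for illustration: head dim 128, 32 heads, 8 tokens, 2 streams
    const int64_t n_embd_head = 128, n_head = 32, n_tokens = 8, n_stream = 2;

    ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, n_tokens);

    // split the batch into streams: [128, 32, 8] -> [128, 32, 4, 2]
    q = ggml_reshape_4d(ctx, q, q->ne[0], q->ne[1], q->ne[2]/n_stream, n_stream);

    std::printf("q: [%lld, %lld, %lld, %lld]\n",
            (long long) q->ne[0], (long long) q->ne[1], (long long) q->ne[2], (long long) q->ne[3]);

    ggml_free(ctx);
    return 0;
}
```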
@ -1102,7 +1267,6 @@ llm_graph_input_attn_no_cache * llm_graph_context::build_attn_inp_no_cache() con
|
||||
|
||||
ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_no_cache * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur,
|
||||
@ -1122,11 +1286,15 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
|
||||
const auto & kq_mask = inp->get_kq_mask();
|
||||
|
||||
// [TAG_NO_CACHE_PAD]
|
||||
// TODO: if ubatch.equal_seqs() == true, we can split the three tensors below into ubatch.n_seqs_unq streams
|
||||
assert(!ubatch.equal_seqs());
|
||||
|
||||
ggml_tensor * q = q_cur;
|
||||
ggml_tensor * k = k_cur;
|
||||
ggml_tensor * v = v_cur;
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
@ -1158,11 +1326,12 @@ static std::unique_ptr<llm_graph_input_attn_kv_unified> build_attn_inp_kv_unifie
|
||||
|
||||
const auto n_kv = mctx_cur->get_n_kv();
|
||||
const auto n_tokens = ubatch.n_tokens;
|
||||
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
|
||||
|
||||
inp->self_k_idxs = mctx_cur->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs = mctx_cur->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
@ -1181,7 +1350,6 @@ llm_graph_input_attn_kv_unified * llm_graph_context::build_attn_inp_kv_unified()
|
||||
|
||||
ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_kv_unified * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur,
|
||||
@ -1214,7 +1382,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
@ -1234,7 +1402,6 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
|
||||
ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_kv_unified_iswa * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur,
|
||||
@ -1281,7 +1448,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * k = mctx_cur->get_k(ctx0, il);
|
||||
ggml_tensor * v = mctx_cur->get_v(ctx0, il);
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
@ -1314,7 +1481,6 @@ llm_graph_input_attn_cross * llm_graph_context::build_attn_inp_cross() const {
|
||||
|
||||
ggml_tensor * llm_graph_context::build_attn(
|
||||
llm_graph_input_attn_cross * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur,
|
||||
@ -1336,7 +1502,7 @@ ggml_tensor * llm_graph_context::build_attn(
|
||||
ggml_tensor * k = k_cur;
|
||||
ggml_tensor * v = v_cur;
|
||||
|
||||
ggml_tensor * cur = build_attn_mha(gf, q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
||||
ggml_tensor * cur = build_attn_mha(q, k, v, kq_b, kq_mask, v_mla, kq_scale);
|
||||
cb(cur, "kqv_out", il);
|
||||
|
||||
if (wo) {
|
||||
@ -1362,13 +1528,15 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
|
||||
|
||||
auto inp = std::make_unique<llm_graph_input_attn_kv_unified_iswa>(hparams, cparams, mctx_cur);
|
||||
|
||||
const auto n_stream = cparams.kv_unified ? 1 : ubatch.n_seqs_unq;
|
||||
|
||||
{
|
||||
const auto n_kv = mctx_cur->get_base()->get_n_kv();
|
||||
|
||||
inp->self_k_idxs = mctx_cur->get_base()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs = mctx_cur->get_base()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->self_kq_mask = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask);
|
||||
|
||||
inp->self_kq_mask_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask, GGML_TYPE_F16) : inp->self_kq_mask;
|
||||
@ -1382,7 +1550,7 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
|
||||
inp->self_k_idxs_swa = mctx_cur->get_swa()->build_input_k_idxs(ctx0, ubatch);
|
||||
inp->self_v_idxs_swa = mctx_cur->get_swa()->build_input_v_idxs(ctx0, ubatch);
|
||||
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens, GGML_KQ_MASK_PAD), 1, 1);
|
||||
inp->self_kq_mask_swa = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_kv, GGML_PAD(n_tokens/n_stream, GGML_KQ_MASK_PAD), 1, n_stream);
|
||||
ggml_set_input(inp->self_kq_mask_swa);
|
||||
|
||||
inp->self_kq_mask_swa_cnv = cparams.flash_attn ? ggml_cast(ctx0, inp->self_kq_mask_swa, GGML_TYPE_F16) : inp->self_kq_mask_swa;
|
||||
@ -1392,7 +1560,6 @@ llm_graph_input_attn_kv_unified_iswa * llm_graph_context::build_attn_inp_kv_unif
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_rs(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
ggml_tensor * state_copy,
|
||||
int32_t state_size,
|
||||
@ -1450,19 +1617,17 @@ llm_graph_input_rs * llm_graph_context::build_rs_inp() const {
|
||||
|
||||
ggml_tensor * llm_graph_context::build_rs(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
int32_t state_size,
|
||||
int32_t n_seqs,
|
||||
const llm_graph_get_rows_fn & get_state_rows) const {
|
||||
const auto * kv_state = inp->mctx;
|
||||
|
||||
return build_rs(gf, s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
|
||||
return build_rs(s, inp->s_copy, state_size, n_seqs, kv_state->get_n_rs(), kv_state->get_head(), kv_state->get_size(), kv_state->get_rs_z(), get_state_rows);
|
||||
}
|
||||
|
||||
ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_cgraph * gf,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const {
|
||||
const auto * mctx_cur = static_cast<const llama_memory_recurrent_context *>(mctx);
|
||||
@ -1474,7 +1639,7 @@ ggml_tensor * llm_graph_context::build_rwkv_token_shift_load(
|
||||
ggml_tensor * token_shift_all = mctx_cur->get_r_l(il);
|
||||
|
||||
ggml_tensor * token_shift = build_rs(
|
||||
inp, gf, token_shift_all,
|
||||
inp, token_shift_all,
|
||||
hparams.n_embd_r(), n_seqs);
|
||||
|
||||
token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs);
|
||||
@ -1514,7 +1679,6 @@ llm_graph_input_mem_hybrid * llm_graph_context::build_inp_mem_hybrid() const {
|
||||
}
|
||||
|
||||
void llm_graph_context::build_pooling(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * cls,
|
||||
ggml_tensor * cls_b,
|
||||
ggml_tensor * cls_out,
|
||||
|
@ -1,6 +1,7 @@
|
||||
#pragma once
|
||||
|
||||
#include "llama-arch.h"
|
||||
#include "llama-batch.h"
|
||||
#include "llama-hparams.h"
|
||||
#include "llama-adapter.h"
|
||||
|
||||
@ -14,7 +15,6 @@ struct ggml_cgraph;
|
||||
struct ggml_context;
|
||||
struct ggml_tensor;
|
||||
|
||||
struct llama_ubatch;
|
||||
struct llama_cparams;
|
||||
|
||||
struct llama_memory_context_i;
|
||||
@ -69,6 +69,8 @@ struct llama_cross {
|
||||
std::vector<std::set<llama_seq_id>> seq_ids_enc;
|
||||
};
|
||||
|
||||
struct llm_graph_params;
|
||||
|
||||
//
|
||||
// llm_graph_input
|
||||
//
|
||||
@ -78,11 +80,19 @@ public:
|
||||
virtual ~llm_graph_input_i() = default;
|
||||
|
||||
virtual void set_input(const llama_ubatch * ubatch) = 0;
|
||||
|
||||
// return true if the resulting input tensors using the provided graph parameters would be
|
||||
// the same as the previous input tensors that we have currently stored in the object
|
||||
virtual bool can_reuse(const llm_graph_params & params) {
|
||||
// returning false here by default will prevent reusing the graph if the check
|
||||
// for the input type has not been implemented yet
|
||||
GGML_UNUSED(params);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
|
||||
|
||||
|
||||
class llm_graph_input_embd : public llm_graph_input_i {
|
||||
public:
|
||||
llm_graph_input_embd() = default;
|
||||
@ -90,6 +100,8 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * tokens = nullptr; // I32 [n_batch]
|
||||
ggml_tensor * embd = nullptr; // F32 [n_embd, n_batch]
|
||||
};
|
||||
@ -101,6 +113,8 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * pos = nullptr; // I32 [n_batch]
|
||||
|
||||
const uint32_t n_pos_per_embd = 1;
|
||||
@ -154,17 +168,19 @@ public:
|
||||
llm_graph_input_out_ids(
|
||||
const llama_hparams & hparams,
|
||||
const llama_cparams & cparams,
|
||||
int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
|
||||
uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
|
||||
virtual ~llm_graph_input_out_ids() = default;
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * out_ids; // I32 [n_outputs]
|
||||
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
|
||||
const int32_t n_outputs;
|
||||
const uint32_t n_outputs;
|
||||
};
|
||||
|
||||
class llm_graph_input_mean : public llm_graph_input_i {
|
||||
@ -249,16 +265,18 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
|
||||
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
|
||||
|
||||
ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
|
||||
|
||||
ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
|
||||
ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
|
||||
ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
|
||||
|
||||
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1]
|
||||
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1]
|
||||
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
@ -280,6 +298,8 @@ public:
|
||||
|
||||
void set_input(const llama_ubatch * ubatch) override;
|
||||
|
||||
bool can_reuse(const llm_graph_params & params) override;
|
||||
|
||||
ggml_tensor * get_k_idxs() const { return self_k_idxs; }
|
||||
ggml_tensor * get_v_idxs() const { return self_v_idxs; }
|
||||
ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
|
||||
@ -289,14 +309,14 @@ public:
|
||||
ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }
|
||||
|
||||
ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
|
||||
ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch]
|
||||
ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
|
||||
ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
|
||||
ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch]
|
||||
ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
|
||||
|
||||
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch, 1, 1]
|
||||
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch, 1, 1]
|
||||
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch, 1, 1]
|
||||
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch, 1, 1]
|
||||
ggml_tensor * self_kq_mask = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
ggml_tensor * self_kq_mask_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
ggml_tensor * self_kq_mask_swa = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
ggml_tensor * self_kq_mask_swa_cnv = nullptr; // [n_kv, n_batch/n_stream, 1, n_stream]
|
||||
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
@ -351,65 +371,20 @@ public:
|
||||
// along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
|
||||
// these are used by the llama_context to extract the relevant data, based on the compute parameters
|
||||
|
||||
class llm_graph_result_i {
|
||||
public:
|
||||
virtual ~llm_graph_result_i() = default;
|
||||
|
||||
virtual ggml_tensor * get_tokens() = 0;
|
||||
virtual ggml_tensor * get_logits() = 0;
|
||||
virtual ggml_tensor * get_embd() = 0;
|
||||
virtual ggml_tensor * get_embd_pooled() = 0;
|
||||
|
||||
virtual void set_inputs(const llama_ubatch * ubatch) = 0;
|
||||
};
|
||||
|
||||
using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
|
||||
|
||||
|
||||
class llm_graph_result : public llm_graph_result_i {
|
||||
public:
|
||||
virtual ~llm_graph_result() = default;
|
||||
|
||||
ggml_tensor * get_tokens() override { return t_tokens; }
|
||||
ggml_tensor * get_logits() override { return t_logits; }
|
||||
ggml_tensor * get_embd() override { return t_embd; }
|
||||
ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
|
||||
|
||||
void set_inputs(const llama_ubatch * ubatch) override {
|
||||
for (auto & input : inputs) {
|
||||
input->set_input(ubatch);
|
||||
}
|
||||
}
|
||||
|
||||
llm_graph_input_i * add_input(llm_graph_input_ptr input) {
|
||||
inputs.emplace_back(std::move(input));
|
||||
return inputs.back().get();
|
||||
}
|
||||
|
||||
// important graph nodes
|
||||
ggml_tensor * t_tokens = nullptr;
|
||||
ggml_tensor * t_logits = nullptr;
|
||||
ggml_tensor * t_embd = nullptr;
|
||||
ggml_tensor * t_embd_pooled = nullptr;
|
||||
|
||||
std::vector<llm_graph_input_ptr> inputs;
|
||||
};
|
||||
|
||||
//
|
||||
// llm_graph_context
|
||||
//
|
||||
|
||||
// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
||||
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
|
||||
|
||||
class llm_graph_result;
|
||||
|
||||
struct llm_graph_params {
|
||||
ggml_context * ctx;
|
||||
llm_arch arch = LLM_ARCH_UNKNOWN;
|
||||
|
||||
const llm_arch arch;
|
||||
llama_hparams hparams;
|
||||
llama_cparams cparams;
|
||||
|
||||
const llama_hparams & hparams;
|
||||
const llama_cparams & cparams;
|
||||
const llama_ubatch & ubatch;
|
||||
llama_ubatch ubatch; // note: intentionally make a copy
|
||||
|
||||
llm_graph_type gtype;
|
||||
|
||||
ggml_backend_sched_t sched;
|
||||
ggml_backend_t backend_cpu;
|
||||
@ -421,9 +396,117 @@ struct llm_graph_params {
|
||||
|
||||
uint32_t n_outputs;
|
||||
|
||||
const llm_graph_cb & cb;
|
||||
llm_graph_cb cb;
|
||||
|
||||
llm_graph_result * res;
|
||||
|
||||
// return true if the "other" params would result in a graph with the same topology as with the current params
|
||||
// having the same topology allows us to reuse the graph in some cases
|
||||
bool allow_reuse(const llm_graph_params & other) const {
|
||||
// first check the ubatch
|
||||
bool can_reuse_ubatch =
|
||||
ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
|
||||
ubatch.n_tokens == other.ubatch.n_tokens &&
|
||||
ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
|
||||
ubatch.n_seqs == other.ubatch.n_seqs &&
|
||||
ubatch.n_seqs_unq == other.ubatch.n_seqs_unq &&
|
||||
(
|
||||
(!ubatch.token && !other.ubatch.token) ||
|
||||
(!ubatch.embd && !other.ubatch.embd)
|
||||
);
|
||||
|
||||
if (can_reuse_ubatch && !ubatch.equal_seqs()) {
|
||||
if (!ubatch.data) {
|
||||
// if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
|
||||
// therefore we cannot perform the sequence id check. normally this should never happen
|
||||
can_reuse_ubatch = false;
|
||||
} else {
|
||||
for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
|
||||
can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!can_reuse_ubatch) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return
|
||||
cparams.embeddings == other.cparams.embeddings &&
|
||||
cparams.causal_attn == other.cparams.causal_attn &&
|
||||
arch == other.arch &&
|
||||
gtype == other.gtype &&
|
||||
cvec == other.cvec &&
|
||||
loras == other.loras &&
|
||||
cross == other.cross &&
|
||||
n_outputs == other.n_outputs;
|
||||
}
|
||||
};
|
||||
|
||||
class llm_graph_result {
|
||||
public:
|
||||
llm_graph_result(int64_t max_nodes);
|
||||
|
||||
virtual ~llm_graph_result() = default;
|
||||
|
||||
ggml_tensor * get_tokens() const { return t_tokens; }
|
||||
ggml_tensor * get_logits() const { return t_logits; }
|
||||
ggml_tensor * get_embd() const { return t_embd; }
|
||||
ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }
|
||||
|
||||
ggml_cgraph * get_gf() const { return gf; }
|
||||
ggml_context * get_ctx() const { return ctx_compute.get(); }
|
||||
|
||||
int64_t get_max_nodes() const;
|
||||
|
||||
void reset();
|
||||
|
||||
void set_inputs(const llama_ubatch * ubatch);
|
||||
|
||||
// try to update the existing graph result using the new graph parameters in order to reuse it
|
||||
// this can only be done if we determine that the resulting graph using the new graph parameters
|
||||
// would be identical to the existing graph. in that case, we simply have to update the memory
|
||||
// contexts of the input tensors of the graph and we can reuse it for another computation
|
||||
// return true if the graph was updated and can be reused
|
||||
bool can_reuse(const llm_graph_params & params);
|
||||
|
||||
llm_graph_input_i * add_input(llm_graph_input_ptr input);
|
||||
|
||||
void set_params(const llm_graph_params & params);
|
||||
|
||||
// important graph nodes
|
||||
ggml_tensor * t_tokens = nullptr;
|
||||
ggml_tensor * t_logits = nullptr;
|
||||
ggml_tensor * t_embd = nullptr;
|
||||
ggml_tensor * t_embd_pooled = nullptr;
|
||||
|
||||
std::vector<llm_graph_input_ptr> inputs;
|
||||
|
||||
ggml_context_ptr ctx_compute;
|
||||
|
||||
// memory buffers used to evaluate the model
|
||||
std::vector<uint8_t> buf_compute_meta;
|
||||
|
||||
ggml_cgraph * gf;
|
||||
|
||||
int64_t max_nodes;
|
||||
|
||||
private:
|
||||
// keep a copy of the previous graph parameters
|
||||
// we will use this to determine whether the graph can be reused by comparing them with the new parameters
|
||||
// note: these are updated after constructing the new graph
|
||||
llm_graph_params params;
|
||||
|
||||
// env: LLAMA_GRAPH_RESULT_DEBUG
|
||||
int debug = 0;
|
||||
};
|
||||
|
||||
using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
|
||||
|
||||
//
|
||||
// llm_graph_context
|
||||
//
|
||||
|
||||
// used in build_rs to properly order writes and avoid unnecessary copies
|
||||
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
|
||||
|
||||
@ -463,8 +546,6 @@ struct llm_graph_context {
|
||||
const enum llama_pooling_type pooling_type;
|
||||
const enum llama_rope_type rope_type;
|
||||
|
||||
ggml_context * ctx0 = nullptr;
|
||||
|
||||
ggml_backend_sched_t sched;
|
||||
|
||||
ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
|
||||
@ -476,7 +557,10 @@ struct llm_graph_context {
|
||||
|
||||
const llm_graph_cb & cb_func;
|
||||
|
||||
std::unique_ptr<llm_graph_result> res;
|
||||
llm_graph_result * res;
|
||||
|
||||
ggml_context * ctx0 = nullptr;
|
||||
ggml_cgraph * gf = nullptr;
|
||||
|
||||
llm_graph_context(const llm_graph_params & params);
|
||||
virtual ~llm_graph_context() = default;
|
||||
@ -562,7 +646,6 @@ struct llm_graph_context {
|
||||
//
|
||||
|
||||
ggml_tensor * build_attn_mha(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * q, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
ggml_tensor * k, // [n_embd_head_k, n_head_k, n_tokens]
|
||||
ggml_tensor * v, // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
|
||||
@ -575,7 +658,6 @@ struct llm_graph_context {
|
||||
|
||||
ggml_tensor * build_attn(
|
||||
llm_graph_input_attn_no_cache * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
@ -590,7 +672,6 @@ struct llm_graph_context {
|
||||
|
||||
ggml_tensor * build_attn(
|
||||
llm_graph_input_attn_kv_unified * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
@ -606,7 +687,6 @@ struct llm_graph_context {
|
||||
// note: if k_cur or v_cur are not provided, they will not be stored in the memory
|
||||
ggml_tensor * build_attn(
|
||||
llm_graph_input_attn_kv_unified_iswa * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
@ -621,7 +701,6 @@ struct llm_graph_context {
|
||||
|
||||
ggml_tensor * build_attn(
|
||||
llm_graph_input_attn_cross * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * wo,
|
||||
ggml_tensor * wo_b,
|
||||
ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
|
||||
@ -643,7 +722,6 @@ struct llm_graph_context {
|
||||
// implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
|
||||
// `llama_memory_recurrent`
|
||||
ggml_tensor * build_rs(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
ggml_tensor * state_copy,
|
||||
int32_t state_size,
|
||||
@ -658,7 +736,6 @@ struct llm_graph_context {
|
||||
|
||||
ggml_tensor * build_rs(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * s,
|
||||
int32_t state_size,
|
||||
int32_t n_seqs,
|
||||
@ -666,7 +743,6 @@ struct llm_graph_context {
|
||||
|
||||
ggml_tensor * build_rwkv_token_shift_load(
|
||||
llm_graph_input_rs * inp,
|
||||
ggml_cgraph * gf,
|
||||
const llama_ubatch & ubatch,
|
||||
int il) const;
|
||||
|
||||
@ -685,7 +761,6 @@ struct llm_graph_context {
|
||||
//
|
||||
|
||||
void build_pooling(
|
||||
ggml_cgraph * gf,
|
||||
ggml_tensor * cls,
|
||||
ggml_tensor * cls_b,
|
||||
ggml_tensor * cls_out,
|
||||
|
@ -65,6 +65,46 @@ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
|
||||
return n_embd_head_v * n_head_kv;
|
||||
}
|
||||
|
||||
bool llama_hparams::is_n_embd_k_gqa_variable() const {
|
||||
const uint32_t val = n_embd_k_gqa();
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
if (val != n_embd_k_gqa(il)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool llama_hparams::is_n_embd_v_gqa_variable() const {
|
||||
const uint32_t val = n_embd_v_gqa();
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
if (val != n_embd_v_gqa(il)) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_k_gqa_max() const {
|
||||
uint32_t val = n_embd_k_gqa();
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
val = std::max(val, n_embd_k_gqa(il));
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_v_gqa_max() const {
|
||||
uint32_t val = n_embd_v_gqa();
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
val = std::max(val, n_embd_v_gqa(il));
|
||||
}
|
||||
|
||||
return val;
|
||||
}
|
||||
|
||||
uint32_t llama_hparams::n_embd_r() const {
|
||||
if (wkv_head_size != 0) {
|
||||
// for RWKV models
|
||||
|
@ -6,7 +6,7 @@
|
||||
|
||||
// bump if necessary
|
||||
#define LLAMA_MAX_LAYERS 512
|
||||
#define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
|
||||
#define LLAMA_MAX_EXPERTS 384 // Kimi-K2
|
||||
|
||||
enum llama_expert_gating_func_type {
|
||||
LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
|
||||
@ -98,7 +98,7 @@ struct llama_hparams {
|
||||
float rope_freq_scale_train;
|
||||
float rope_freq_scale_train_swa;
|
||||
uint32_t n_ctx_orig_yarn;
|
||||
float rope_yarn_log_mul;
|
||||
float rope_yarn_log_mul = 0.0f;
|
||||
|
||||
std::array<int, 4> rope_sections;
|
||||
|
||||
@ -191,6 +191,14 @@ struct llama_hparams {
|
||||
// dimension of value embeddings across all k-v heads
|
||||
uint32_t n_embd_v_gqa(uint32_t il = 0) const;
|
||||
|
||||
// true if any layer has a different n_embd_k_gqa/n_embd_v_gqa
|
||||
bool is_n_embd_k_gqa_variable() const;
|
||||
bool is_n_embd_v_gqa_variable() const;
|
||||
|
||||
// return the maximum n_embd_k_gqa/n_embd_v_gqa across all layers
|
||||
uint32_t n_embd_k_gqa_max() const;
|
||||
uint32_t n_embd_v_gqa_max() const;
|
||||
|
||||
// dimension of the rolling state embeddings
|
||||
// corresponds to Mamba's conv_states size or RWKV's token_shift states size
|
||||
uint32_t n_embd_r() const;
|
||||
|
@ -18,16 +18,17 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
bool swa_full,
|
||||
bool unified,
|
||||
uint32_t kv_size,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_ubatch,
|
||||
uint32_t n_pad) : hparams(model.hparams) {
|
||||
uint32_t n_pad) : hparams(model.hparams), unified(unified) {
|
||||
llama_kv_cache_unified::layer_filter_cb filter_base = [&](int32_t il) { return !model.hparams.is_swa(il); };
|
||||
llama_kv_cache_unified::layer_filter_cb filter_swa = [&](int32_t il) { return model.hparams.is_swa(il); };
|
||||
|
||||
const uint32_t size_base = kv_size;
|
||||
|
||||
uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*n_seq_max + n_ubatch, n_pad));
|
||||
uint32_t size_swa = std::min(size_base, GGML_PAD(hparams.n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
|
||||
|
||||
// when using full-size SWA cache, we set the SWA cache size to be equal to the base cache size
|
||||
if (swa_full) {
|
||||
@ -41,14 +42,14 @@ llama_kv_cache_unified_iswa::llama_kv_cache_unified_iswa(
|
||||
|
||||
kv_base = std::make_unique<llama_kv_cache_unified>(
|
||||
model, std::move(filter_base), type_k, type_v,
|
||||
v_trans, offload, size_base, n_seq_max, n_pad,
|
||||
v_trans, offload, unified, size_base, n_seq_max, n_pad,
|
||||
0, LLAMA_SWA_TYPE_NONE);
|
||||
|
||||
LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);
|
||||
|
||||
kv_swa = std::make_unique<llama_kv_cache_unified>(
|
||||
model, std::move(filter_swa), type_k, type_v,
|
||||
v_trans, offload, size_swa, n_seq_max, n_pad,
|
||||
v_trans, offload, unified, size_swa, n_seq_max, n_pad,
|
||||
hparams.n_swa, hparams.swa_type);
|
||||
}
|
||||
|
||||
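The SWA cell count above scales with the number of sequences only for a unified cache; otherwise one window plus one ubatch is enough per stream. A standalone arithmetic sketch with assumed values (n_swa = 4096, n_seq_max = 4, n_ubatch = 512, n_pad = 256; pad() mirrors GGML_PAD):

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <initializer_list>

// round x up to a multiple of n (same idea as GGML_PAD)
static uint32_t pad(uint32_t x, uint32_t n) {
    return ((x + n - 1) / n) * n;
}

int main() {
    // assumed values for illustration only
    const uint32_t n_swa     = 4096;
    const uint32_t n_seq_max = 4;
    const uint32_t n_ubatch  = 512;
    const uint32_t n_pad     = 256;
    const uint32_t size_base = 32768; // kv_size of the non-SWA cache

    for (bool unified : {true, false}) {
        const uint32_t size_swa = std::min(size_base,
                pad(n_swa*(unified ? n_seq_max : 1) + n_ubatch, n_pad));
        std::printf("unified=%d -> size_swa = %u cells\n", unified ? 1 : 0, size_swa);
        // unified=1 -> 16896, unified=0 -> 4608 (both already multiples of 256)
    }

    return 0;
}
```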
@ -100,6 +101,11 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
|
||||
|
||||
// first try simple split
|
||||
do {
|
||||
if (!unified) {
|
||||
// requires equal splits, so we skip the simple split
|
||||
break;
|
||||
}
|
||||
|
||||
balloc.split_reset();
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
@ -140,7 +146,7 @@ llama_memory_context_ptr llama_kv_cache_unified_iswa::init_batch(llama_batch_all
|
||||
|
||||
std::vector<llama_ubatch> ubatches;
|
||||
while (true) {
|
||||
auto ubatch = balloc.split_equal(n_ubatch, false);
|
||||
auto ubatch = balloc.split_equal(n_ubatch, !unified);
|
||||
|
||||
if (ubatch.n_tokens == 0) {
|
||||
break;
|
||||
|
@ -20,6 +20,7 @@ public:
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
bool swa_full,
|
||||
bool unified,
|
||||
uint32_t kv_size,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_ubatch,
|
||||
@ -68,6 +69,8 @@ public:
|
||||
private:
|
||||
const llama_hparams & hparams;
|
||||
|
||||
const bool unified;
|
||||
|
||||
std::unique_ptr<llama_kv_cache_unified> kv_base;
|
||||
std::unique_ptr<llama_kv_cache_unified> kv_swa;
|
||||
};
|
||||
|
File diff suppressed because it is too large
@ -35,16 +35,50 @@ public:
|
||||
std::vector<uint32_t> ids;
|
||||
};
|
||||
|
||||
struct stream_copy_info {
|
||||
bool empty() const {
|
||||
assert(ssrc.size() == sdst.size());
|
||||
return ssrc.empty();
|
||||
}
|
||||
|
||||
std::vector<uint32_t> ssrc;
|
||||
std::vector<uint32_t> sdst;
|
||||
};
|
||||
|
||||
// for each ubatch, create a slot_info that contains information about where the ubatch should be inserted in the
|
||||
// KV cells. for example, cell indices for each token, such that: token[i] -> goes to cells[idxs[i]]
|
||||
struct slot_info {
|
||||
// data for ggml_set_rows
|
||||
using idx_vec_t = std::vector<uint32_t>;
|
||||
|
||||
idx_vec_t idxs;
|
||||
// number of streams: ns = s1 - s0 + 1
|
||||
llama_seq_id s0;
|
||||
llama_seq_id s1;
|
||||
|
||||
std::vector<llama_seq_id> strm; // [ns]
|
||||
std::vector<idx_vec_t> idxs; // [ns]
|
||||
|
||||
uint32_t head() const {
|
||||
return idxs.at(0);
|
||||
GGML_ASSERT(idxs.size() == 1);
|
||||
GGML_ASSERT(!idxs[0].empty());
|
||||
|
||||
return idxs[0][0];
|
||||
}
|
||||
|
||||
void resize(size_t n) {
|
||||
strm.resize(n);
|
||||
idxs.resize(n);
|
||||
}
|
||||
|
||||
size_t size() const {
|
||||
GGML_ASSERT(idxs.size() == strm.size());
|
||||
GGML_ASSERT(!idxs.empty());
|
||||
|
||||
return idxs[0].size();
|
||||
}
|
||||
|
||||
size_t n_stream() const {
|
||||
return strm.size();
|
||||
}
|
||||
|
||||
bool empty() const {
|
||||
@ -54,9 +88,6 @@ public:
|
||||
void clear() {
|
||||
idxs.clear();
|
||||
}
|
||||
|
||||
// TODO: implement
|
||||
//std::vector<idx_vec_t> seq_idxs;
|
||||
};
|
||||
|
||||
using slot_info_vec_t = std::vector<slot_info>;
|
||||
@ -68,6 +99,7 @@ public:
|
||||
ggml_type type_v,
|
||||
bool v_trans,
|
||||
bool offload,
|
||||
bool unified,
|
||||
uint32_t kv_size,
|
||||
uint32_t n_seq_max,
|
||||
uint32_t n_pad,
|
||||
@ -112,6 +144,7 @@ public:
|
||||
//
|
||||
|
||||
uint32_t get_size() const;
|
||||
uint32_t get_n_stream() const;
|
||||
|
||||
bool get_has_shift() const;
|
||||
|
||||
@ -121,9 +154,12 @@ public:
|
||||
|
||||
uint32_t get_n_kv() const;
|
||||
|
||||
// TODO: temporary
|
||||
bool get_supports_set_rows() const;
|
||||
|
||||
// get views of the current state of the cache
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv) const;
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
|
||||
|
||||
// store k_cur and v_cur in the cache based on the provided head location
|
||||
ggml_tensor * cpy_k(ggml_context * ctx, ggml_tensor * k_cur, ggml_tensor * k_idxs, int32_t il, const slot_info & sinfo) const;
|
||||
@ -137,7 +173,7 @@ public:
|
||||
// return empty vector on failure
|
||||
slot_info_vec_t prepare(const std::vector<llama_ubatch> & ubatches);
|
||||
|
||||
bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo);
|
||||
bool update(llama_context * lctx, bool do_shift, const defrag_info & dinfo, const stream_copy_info & sc_info);
|
||||
|
||||
// find a slot of kv cells that can hold the ubatch
|
||||
// if cont == true, then the slot must be continuous
|
||||
@ -157,8 +193,9 @@ public:
|
||||
void set_input_k_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
|
||||
void set_input_v_idxs(ggml_tensor * dst, const llama_ubatch * ubatch, const slot_info & sinfo) const;
|
||||
|
||||
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
|
||||
void set_input_k_shift(ggml_tensor * dst) const;
|
||||
|
||||
void set_input_kq_mask (ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const;
|
||||
void set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const;
|
||||
|
||||
private:
|
||||
@ -172,15 +209,15 @@ private:
|
||||
|
||||
ggml_tensor * k;
|
||||
ggml_tensor * v;
|
||||
|
||||
std::vector<ggml_tensor *> k_stream;
|
||||
std::vector<ggml_tensor *> v_stream;
|
||||
};
|
||||
|
||||
bool v_trans = true; // the value tensor is transposed
|
||||
|
||||
// the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
|
||||
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method
|
||||
uint32_t head = 0;
|
||||
|
||||
const uint32_t n_seq_max = 1;
|
||||
const uint32_t n_stream = 1;
|
||||
|
||||
// required padding
|
||||
const uint32_t n_pad = 1;
|
||||
@ -193,14 +230,24 @@ private:
|
||||
|
||||
// env: LLAMA_SET_ROWS (temporary)
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14285
|
||||
int supports_set_rows = false;
|
||||
bool supports_set_rows = false;
|
||||
|
||||
const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;
|
||||
|
||||
std::vector<ggml_context_ptr> ctxs;
|
||||
std::vector<ggml_backend_buffer_ptr> bufs;
|
||||
|
||||
llama_kv_cells_unified cells;
|
||||
// the current index from where we start searching for a free slot in the ring buffer of KV cells (see find_slot())
|
||||
// note: this is not part of the KV state and it's only used to speed-up the find_slot() method
|
||||
std::vector<uint32_t> v_heads;
|
||||
|
||||
std::vector<llama_kv_cells_unified> v_cells;
|
||||
|
||||
// maps from a sequence id to a stream id
|
||||
std::vector<uint32_t> seq_to_stream;
|
||||
|
||||
// pending stream copies that will be applied during the next update
|
||||
stream_copy_info sc_info;
|
||||
|
||||
std::vector<kv_layer> layers;
|
||||
|
||||
@ -226,22 +273,26 @@ private:
|
||||
float freq_base,
|
||||
float freq_scale) const;
|
||||
|
||||
llm_graph_result_ptr build_graph_shift(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf) const;
|
||||
ggml_cgraph * build_graph_shift(
|
||||
llm_graph_result * res,
|
||||
llama_context * lctx) const;
|
||||
|
||||
llm_graph_result_ptr build_graph_defrag(
|
||||
const llama_cparams & cparams,
|
||||
ggml_context * ctx,
|
||||
ggml_cgraph * gf,
|
||||
ggml_cgraph * build_graph_defrag(
|
||||
llm_graph_result * res,
|
||||
llama_context * lctx,
|
||||
const defrag_info & dinfo) const;
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
|
||||
struct cell_ranges_t {
|
||||
uint32_t strm;
|
||||
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
|
||||
std::vector<std::pair<uint32_t, uint32_t>> data; // ranges, from inclusive, to exclusive
|
||||
};
|
||||
|
||||
void state_write_meta(llama_io_write_i & io, const cell_ranges_t & cr, llama_seq_id seq_id = -1) const;
|
||||
void state_write_data(llama_io_write_i & io, const cell_ranges_t & cr) const;
|
||||
|
||||
bool state_read_meta(llama_io_read_i & io, uint32_t strm, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
|
||||
bool state_read_data(llama_io_read_i & io, uint32_t strm, uint32_t cell_count);
|
||||
};
|
||||
|
||||
class llama_kv_cache_unified_context : public llama_memory_context_i {
|
||||
@ -249,6 +300,7 @@ public:
|
||||
// some shorthands
|
||||
using slot_info_vec_t = llama_kv_cache_unified::slot_info_vec_t;
|
||||
using defrag_info = llama_kv_cache_unified::defrag_info;
|
||||
using stream_copy_info = llama_kv_cache_unified::stream_copy_info;
|
||||
|
||||
// used for errors
|
||||
llama_kv_cache_unified_context(llama_memory_status status);
|
||||
@ -262,7 +314,8 @@ public:
|
||||
llama_kv_cache_unified * kv,
|
||||
llama_context * lctx,
|
||||
bool do_shift,
|
||||
defrag_info dinfo);
|
||||
defrag_info dinfo,
|
||||
stream_copy_info sc_info);
|
||||
|
||||
// used to create a batch procesing context from a batch
|
||||
llama_kv_cache_unified_context(
|
||||
@ -288,6 +341,9 @@ public:
|
||||
|
||||
uint32_t get_n_kv() const;
|
||||
|
||||
// TODO: temporary
|
||||
bool get_supports_set_rows() const;
|
||||
|
||||
// get views of the current state of the cache
|
||||
ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
|
||||
ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;
|
||||
@ -320,6 +376,8 @@ private:
|
||||
|
||||
defrag_info dinfo;
|
||||
|
||||
stream_copy_info sc_info;
|
||||
|
||||
//
|
||||
// batch processing context
|
||||
//
|
||||
|
@ -38,6 +38,7 @@ llama_memory_hybrid::llama_memory_hybrid(
|
||||
type_v,
|
||||
v_trans,
|
||||
offload,
|
||||
1,
|
||||
kv_size,
|
||||
n_seq_max,
|
||||
n_pad,
|
||||
|
@ -446,7 +446,7 @@ bool llama_memory_recurrent::find_slot(const llama_ubatch & ubatch) {
|
||||
// A slot should always be contiguous.
|
||||
|
||||
// can only process batches with an equal number of new tokens in each sequence
|
||||
GGML_ASSERT(ubatch.equal_seqs);
|
||||
GGML_ASSERT(ubatch.equal_seqs());
|
||||
|
||||
int32_t min = size - 1;
|
||||
int32_t max = 0;
|
||||
@ -768,6 +768,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
// Iterate and write all the keys first, each row is a cell
|
||||
// Get whole range at a time
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (r_l[il] == nullptr) continue;
|
||||
|
||||
// Write key type
|
||||
const int32_t r_type_i = (int32_t)r_l[il]->type;
|
||||
@ -787,6 +789,8 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
|
||||
if (!s_trans) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
// Write value type
|
||||
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
||||
@ -807,6 +811,9 @@ void llama_memory_recurrent::state_write_data(llama_io_write_i & io, const std::
|
||||
// When v is transposed, we also need the element size and get the element ranges from each row
|
||||
const uint32_t mem_size = size;
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers (read_data will handle this by checking "r_l" and "s_l" for null)
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
const uint32_t n_embd_s = hparams.n_embd_s();
|
||||
|
||||
// Write value type
|
||||
@ -951,6 +958,8 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
|
||||
// For each layer, read the keys for each cell, one row is one cell, read as one contiguous block
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (r_l[il] == nullptr) continue;
|
||||
|
||||
// Read type of key
|
||||
int32_t r_type_i_ref;
|
||||
@ -978,11 +987,14 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
|
||||
if (!s_trans) {
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
// Read type of value
|
||||
int32_t s_type_i_ref;
|
||||
io.read_to(&s_type_i_ref, sizeof(s_type_i_ref));
|
||||
const int32_t s_type_i = (int32_t)s_l[il]->type;
|
||||
|
||||
if (s_type_i != s_type_i_ref) {
|
||||
LLAMA_LOG_ERROR("%s: mismatched s type (%d != %d, layer %d)\n", __func__, s_type_i, s_type_i_ref, il);
|
||||
return false;
|
||||
@ -1005,6 +1017,9 @@ bool llama_memory_recurrent::state_read_data(llama_io_read_i & io, uint32_t cell
|
||||
} else {
|
||||
// For each layer, read the values for each cell (transposed)
|
||||
for (uint32_t il = 0; il < n_layer; ++il) {
|
||||
// skip null layers
|
||||
if (s_l[il] == nullptr) continue;
|
||||
|
||||
const uint32_t n_embd_s = hparams.n_embd_s();
|
||||
|
||||
// Read type of value
|
||||
|
File diff suppressed because it is too large
@ -99,8 +99,10 @@ enum llm_type {
|
||||
LLM_TYPE_17B_16E, // llama4 Scout
|
||||
LLM_TYPE_17B_128E, // llama4 Maverick
|
||||
LLM_TYPE_A13B,
|
||||
LLM_TYPE_21B_A3B, // Ernie MoE small
|
||||
LLM_TYPE_30B_A3B,
|
||||
LLM_TYPE_235B_A22B,
|
||||
LLM_TYPE_300B_A47B, // Ernie MoE big
|
||||
LLM_TYPE_E2B,
|
||||
LLM_TYPE_E4B,
|
||||
};
|
||||
@ -452,10 +454,7 @@ struct llama_model {
|
||||
llama_memory_i * create_memory(const llama_memory_params & params, llama_cparams & cparams) const;
|
||||
|
||||
// TODO: move this to new llm_arch_model_i interface
|
||||
llm_graph_result_ptr build_graph(
|
||||
const llm_graph_params & params,
|
||||
ggml_cgraph * gf,
|
||||
llm_graph_type type) const;
|
||||
ggml_cgraph * build_graph(const llm_graph_params & params) const;
|
||||
|
||||
private:
|
||||
struct impl;
|
||||
|
@ -884,8 +884,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
|
||||
if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
|
||||
if (qtype != new_type) {
|
||||
LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
|
||||
new_type = qtype;
|
||||
break; // if two or more types are specified for the tensor, first match wins
|
||||
new_type = qtype; // if two or more types are specified for the same tensor, the last match wins
|
||||
}
|
||||
}
|
||||
}
|
||||
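The hunk above switches the tensor-type override semantics from "first match wins" to "last match wins": the break is gone, so every matching pattern reassigns new_type and the final match sticks. A self-contained sketch of that behavior, with made-up patterns and type names:

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <utility>
#include <vector>

int main() {
    // hypothetical override list; patterns and type names are made up for illustration
    const std::vector<std::pair<std::string, std::string>> overrides = {
        { "ffn_.*",     "q5_K" },
        { "ffn_down.*", "q8_0" }, // later entry: wins for tensors matching both
    };

    const std::string tensor_name = "blk.0.ffn_down.weight";
    std::string type = "q4_K"; // default

    for (const auto & [pattern, qtype] : overrides) {
        if (std::regex_search(tensor_name, std::regex(pattern))) {
            type = qtype; // no break: the last matching override takes effect
        }
    }

    std::printf("%s -> %s\n", tensor_name.c_str(), type.c_str());
    return 0;
}
```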
|
@ -11,6 +11,7 @@
|
||||
#include <cassert>
|
||||
#include <cctype>
|
||||
#include <cfloat>
|
||||
#include <cmath>
|
||||
#include <cstdarg>
|
||||
#include <cstring>
|
||||
#include <forward_list>
|
||||
@ -404,6 +405,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
|
||||
"[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_KIMI_K2:
|
||||
regex_exprs = {
|
||||
// K2 trigger pattern - this will activate the custom K2 handler in unicode.cpp
|
||||
// The custom handler implements all K2 patterns with proper Han character exclusion
|
||||
"\\p{Han}+",
|
||||
};
|
||||
break;
|
||||
case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
|
||||
regex_exprs = {
|
||||
"\\p{N}+",
|
||||
@ -1196,6 +1204,284 @@ private:
|
||||
const llm_tokenizer_rwkv & tokenizer;
|
||||
};
|
||||
|
||||
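The PLaMo-2 tokenizer added below segments text with right-to-left dynamic programming: scores[i] is the best (lowest) cost for the suffix starting at position i, and each candidate piece of length len updates scores[i] = min(scores[i], scores[i + len] - score(piece)); unknown characters fall back to byte tokens. The real implementation walks a packed suffix table for speed; the toy below (made-up vocabulary, plain map lookups, no byte fallback) only illustrates the recurrence:

```cpp
#include <cstdio>
#include <limits>
#include <string>
#include <unordered_map>
#include <vector>

int main() {
    // toy vocabulary: piece -> score (higher is better, mirroring the sign convention above)
    const std::unordered_map<std::string, double> vocab = {
        {"un", 2.0}, {"related", 3.0}, {"rel", 1.0}, {"ated", 1.0},
        {"u", 0.1}, {"n", 0.1}, {"r", 0.1}, {"e", 0.1}, {"l", 0.1},
        {"a", 0.1}, {"t", 0.1}, {"d", 0.1},
    };

    const std::string text = "unrelated";
    const size_t n = text.size();

    const double INF = std::numeric_limits<double>::infinity();

    std::vector<double> scores(n + 1, INF); // scores[i]: best cost of the suffix text[i..)
    std::vector<size_t> best_len(n + 1, 0); // length of the piece chosen at position i
    scores[n] = 0.0;

    // process from end to beginning, exactly like the encode() loop below
    for (size_t i = n; i-- > 0; ) {
        for (size_t len = 1; i + len <= n; ++len) {
            const auto it = vocab.find(text.substr(i, len));
            if (it == vocab.end()) {
                continue;
            }
            const double s = scores[i + len] - it->second; // better pieces lower the total cost
            if (s < scores[i]) {
                scores[i]   = s;
                best_len[i] = len;
            }
        }
    }

    // decode the best segmentation front-to-back
    for (size_t pos = 0; pos < n; pos += best_len[pos]) {
        std::printf("%s ", text.substr(pos, best_len[pos]).c_str());
    }
    std::printf("\n"); // expected: "un related"
    return 0;
}
```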
struct llm_tokenizer_plamo2 : llm_tokenizer {
|
||||
llm_tokenizer_plamo2(const llama_vocab & vocab) {
|
||||
build(vocab);
|
||||
}
|
||||
|
||||
void build(const llama_vocab & vocab) {
|
||||
// Reset internal structures
|
||||
tokens_.clear();
|
||||
bytes_.assign(256, 0);
|
||||
to_suffix_id_.clear();
|
||||
table_.clear();
|
||||
|
||||
// Build token list and byte mapping
|
||||
std::unordered_map<std::string, float> suffix_to_score;
|
||||
std::unordered_map<std::string, llama_token> token_to_id;
|
||||
|
||||
for (size_t token_id = 0; token_id < vocab.n_tokens(); ++token_id) {
|
||||
const auto & entry = vocab.get_token_data(token_id);
|
||||
tokens_.push_back(entry.text);
|
||||
token_to_id[entry.text] = static_cast<llama_token>(token_id);
|
||||
|
||||
// Handle byte tokens
|
||||
if (vocab.is_byte(token_id)) {
|
||||
if (entry.text.length() == 6 && entry.text.substr(0, 3) == "<0x" && entry.text.back() == '>') {
|
||||
std::string hex_str = entry.text.substr(3, 2);
|
||||
int byte_val = std::stoi(hex_str, nullptr, 16);
|
||||
bytes_[byte_val] = static_cast<llama_token>(token_id);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// Add token and all its suffixes to suffix_to_score
|
||||
suffix_to_score[entry.text] = entry.score;
|
||||
|
||||
// Extract suffixes character by character (UTF-8 aware)
|
||||
std::vector<uint32_t> cpts = unicode_cpts_from_utf8(entry.text);
|
||||
for (size_t i = 1; i < cpts.size(); ++i) {
|
||||
std::string suffix;
|
||||
for (size_t j = i; j < cpts.size(); ++j) {
|
||||
suffix += unicode_cpt_to_utf8(cpts[j]);
|
||||
}
|
||||
if (suffix_to_score.find(suffix) == suffix_to_score.end()) {
|
||||
suffix_to_score[suffix] = std::numeric_limits<float>::quiet_NaN();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Check that all byte tokens are set
|
||||
for (int i = 0; i < 256; ++i) {
|
||||
if (bytes_[i] == 0) {
|
||||
throw std::runtime_error("Byte token for <0x" + std::to_string(i) + "> is not set");
|
||||
}
|
||||
}
|
||||
|
||||
// Build suffix list in lexicographical order of reversed strings
|
||||
std::vector<std::string> suffixes;
|
||||
for (const auto & pair : suffix_to_score) {
|
||||
suffixes.push_back(pair.first);
|
||||
}
|
||||
suffixes.push_back(""); // Empty suffix
|
||||
|
||||
std::sort(suffixes.begin(), suffixes.end(), [](const std::string & a, const std::string & b) {
|
||||
std::string rev_a(a.rbegin(), a.rend());
|
||||
std::string rev_b(b.rbegin(), b.rend());
|
||||
return rev_a < rev_b;
|
||||
});

        // Build suffix_to_id and to_suffix_id_
        std::unordered_map<std::string, int32_t> suffix_to_id;
        int32_t num_pieces = 0;

        for (const auto & suffix : suffixes) {
            suffix_to_id[suffix] = num_pieces;
            if (!suffix.empty()) {
                std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);

                std::string remaining;
                for (size_t i = 1; i < cpts.size(); ++i) {
                    remaining += unicode_cpt_to_utf8(cpts[i]);
                }

                int64_t piece_code = (static_cast<int64_t>(cpts[0]) << 32) | suffix_to_id[remaining];
                to_suffix_id_[piece_code] = num_pieces;

                // Count number of pieces for this suffix
                int32_t pieces_for_suffix = 1; // sentinel row
                for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                    std::string piece;
                    for (int32_t i = 0; i < piece_length; ++i) {
                        piece += unicode_cpt_to_utf8(cpts[i]);
                    }
                    if (suffix_to_score.find(piece) != suffix_to_score.end()) {
                        pieces_for_suffix++;
                    }
                }
                num_pieces += pieces_for_suffix;
            } else {
                num_pieces++; // Empty suffix contributes one piece (sentinel row)
            }
        }

        // Build flattened table
        table_.resize(num_pieces, std::vector<int32_t>(4, 0));
        int32_t table_idx = 0;

        for (const auto & suffix : suffixes) {
            // Add all prefixes of the suffix to the table (in decreasing order of length)
            std::vector<uint32_t> cpts = unicode_cpts_from_utf8(suffix);
            for (int32_t piece_length = static_cast<int32_t>(cpts.size()); piece_length > 0; --piece_length) {
                std::string piece;
                for (int32_t i = 0; i < piece_length; ++i) {
                    piece += unicode_cpt_to_utf8(cpts[i]);
                }

                auto score_it = suffix_to_score.find(piece);
                if (score_it == suffix_to_score.end()) {
                    continue;
                }

                table_[table_idx][TABLE_PIECE_LENGTH] = piece_length;
                auto token_it = token_to_id.find(piece);
                table_[table_idx][TABLE_TOKEN_ID] = (token_it != token_to_id.end()) ? token_it->second : -1;

                float score = score_it->second;
                table_[table_idx][TABLE_SCORE] = std::isfinite(score) ?
                    static_cast<int32_t>(std::round(score * 1e4)) : INVALID_SCORE;
                table_[table_idx][TABLE_PIECE_ID] = suffix_to_id[piece];

                table_idx++;
            }

            // Add sentinel row
            table_[table_idx][TABLE_PIECE_LENGTH] = 1;
            table_[table_idx][TABLE_TOKEN_ID] = -1;
            table_[table_idx][TABLE_SCORE] = UNKNOWN_SCORE;
            table_idx++;
        }
    }

    std::vector<llama_token> encode(const std::string & text) const {
        std::vector<uint32_t> unicode_data = unicode_cpts_from_utf8(text);
        // Skip the first code point if it is a BOM (Byte Order Mark)
        if (!unicode_data.empty() && unicode_data[0] == 0xFEFF) {
            unicode_data.erase(unicode_data.begin());
        }

        if (unicode_data.empty()) {
            return {};
        }

        const size_t data_len = unicode_data.size();

        // Initialize scores array (dynamic programming)
        std::vector<int64_t> scores(data_len + 1, static_cast<int64_t>(1) << 60);
        scores[data_len] = 0;

        // Path array to track best tokenization
        std::vector<std::vector<int32_t>> path(data_len + 1, std::vector<int32_t>(3, 0));

        int32_t suffix_id = 0;

        // Process from end to beginning
        for (int i = static_cast<int>(data_len) - 1; i >= 0; --i) {
            uint32_t c = unicode_data[i];

            // Find next suffix ID
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int64_t piece_code = (static_cast<int64_t>(c) << 32) | table_[p][TABLE_PIECE_ID];
                auto it = to_suffix_id_.find(piece_code);
                suffix_id = (it != to_suffix_id_.end()) ? it->second : 0;

                if (suffix_id > 0 || table_[p][TABLE_SCORE] == UNKNOWN_SCORE) {
                    break;
                }
            }

            // Update best path
            for (size_t p = suffix_id; p < table_.size(); ++p) {
                int32_t score = table_[p][TABLE_SCORE];
                if (score > INVALID_SCORE) {
                    int32_t piece_length = table_[p][TABLE_PIECE_LENGTH];
                    int64_t s = scores[i + piece_length] - score;

                    if (s < scores[i]) {
                        scores[i] = s;
                        path[i][PATH_TOKEN_LENGTH] = piece_length;
                        path[i][PATH_TOKEN_ID] = table_[p][TABLE_TOKEN_ID];
                        path[i][PATH_NUM_TOKENS] = path[i + piece_length][PATH_NUM_TOKENS] + 1;

                        if (score == UNKNOWN_SCORE) {
                            // Add UTF-8 byte count
                            path[i][PATH_NUM_TOKENS] += (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);
                        }
                    }
                }

                if (score == UNKNOWN_SCORE) {
                    break;
                }
            }
        }

        // Decode the best path
        std::vector<llama_token> token_ids;
        token_ids.reserve(path[0][PATH_NUM_TOKENS]);

        int pos = 0;
        while (pos < static_cast<int>(data_len)) {
            if (path[pos][PATH_TOKEN_ID] >= 0) {
                token_ids.push_back(path[pos][PATH_TOKEN_ID]);
            } else {
                // Fall back to byte tokens
                uint32_t c = unicode_data[pos];
                int s = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);

                for (int i = 0; i < s; ++i) {
                    uint8_t b;
                    if (s == 1) {
                        b = c;
                    } else {
                        if (i == 0) {
                            b = (0xF00 >> s) & 0xFF;
                        } else {
                            b = 0x80;
                        }
                    }
                    token_ids.push_back(bytes_[b | ((c >> ((s - i - 1) * 6)) & 0x3F)]);
                }
            }

            assert(path[pos][PATH_TOKEN_LENGTH] > 0);
            pos += path[pos][PATH_TOKEN_LENGTH];
        }

        return token_ids;
    }
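
    // Worked example of the byte fallback above (illustrative): for an unknown
    // code point such as U+00E9 ("é"), s == 2, so the loop emits bytes_[0xC3]
    // followed by bytes_[0xA9], which is exactly the UTF-8 encoding C3 A9 and
    // corresponds to the <0xC3> and <0xA9> byte tokens of the vocabulary.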

private:
    // Constants for table structure
    static constexpr int32_t TABLE_PIECE_LENGTH = 0;
    static constexpr int32_t TABLE_TOKEN_ID = 1;
    static constexpr int32_t TABLE_SCORE = 2;
    static constexpr int32_t TABLE_PIECE_ID = 3;

    // Constants for path array
    static constexpr int32_t PATH_TOKEN_LENGTH = 0;
    static constexpr int32_t PATH_TOKEN_ID = 1;
    static constexpr int32_t PATH_NUM_TOKENS = 2;

    // Score constants
    static constexpr int32_t INVALID_SCORE = -20000000;
    static constexpr int32_t UNKNOWN_SCORE = -10000000;

    // List of tokens in the vocabulary
    std::vector<std::string> tokens_;

    // Mapping from byte code point to token ID (for byte fallback)
    std::vector<llama_token> bytes_;

    // Mapping from piece code to suffix ID
    std::unordered_map<int64_t, int32_t> to_suffix_id_;

    // Flattened table representing the Trie structure
    // Each row contains: [piece_length, token_id, score, piece_id]
    std::vector<std::vector<int32_t>> table_;
};

struct llm_tokenizer_plamo2_session {
    llm_tokenizer_plamo2_session(const llm_tokenizer_plamo2 & tokenizer) : tokenizer(tokenizer) {}

    void tokenize(const std::string & text, std::vector<llama_token> & output) {
        std::vector<llama_token> tokens = tokenizer.encode(text);
        output.insert(output.end(), tokens.begin(), tokens.end());
    }

private:
    const llm_tokenizer_plamo2 & tokenizer;
};
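
// Minimal usage sketch (illustrative): the tokenizer is constructed from the
// llama_vocab (see init_tokenizer() below) and driven through a session object,
// e.g.:
//
//   llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
//   std::vector<llama_token> out;
//   session.tokenize("Hello world", out);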

//
// impl
//

@ -1499,6 +1785,16 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        special_unk_id = LLAMA_TOKEN_NULL;
        special_sep_id = LLAMA_TOKEN_NULL;
        special_pad_id = LLAMA_TOKEN_NULL;
    } else if (tokenizer_model == "plamo2") {
        type = LLAMA_VOCAB_TYPE_PLAMO2;

        // PLaMo-2 default special tokens (these will be overridden by model config)
        special_bos_id = 1; // <|plamo:bos|>
        special_eos_id = 2; // <|plamo:eos|>
        special_unk_id = 0; // <|plamo:unk|>
        special_sep_id = LLAMA_TOKEN_NULL;
        special_pad_id = 3; // <|plamo:pad|>
        special_mask_id = LLAMA_TOKEN_NULL;
    } else {
        throw std::runtime_error(format("unknown tokenizer: '%s'", tokenizer_model.c_str()));
    }

@ -1629,6 +1925,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
        } else if (
                tokenizer_pre == "exaone") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_EXAONE;
        } else if (
                tokenizer_pre == "exaone4") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
        } else if (
                tokenizer_pre == "chameleon") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_CHAMELEON;

@ -1665,6 +1964,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "hunyuan") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_HUNYUAN;
            clean_spaces = false;
        } else if (
                tokenizer_pre == "kimi-k2") {
            pre_type = LLAMA_VOCAB_PRE_TYPE_KIMI_K2;
            clean_spaces = false;
        } else {
            throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
        }

@ -2151,6 +2454,7 @@ std::string llama_vocab::impl::type_name() const{
        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
        case LLAMA_VOCAB_TYPE_UGM: return "UGM";
        case LLAMA_VOCAB_TYPE_RWKV: return "RWKV";
        case LLAMA_VOCAB_TYPE_PLAMO2: return "PLaMo2";
        default: return "unknown";
    }
}

@ -2234,6 +2538,9 @@ void llama_vocab::impl::init_tokenizer(enum llama_vocab_type type) {
        case LLAMA_VOCAB_TYPE_RWKV:
            tokenizer = std::make_unique<llm_tokenizer_rwkv>(vocab);
            break;
        case LLAMA_VOCAB_TYPE_PLAMO2:
            tokenizer = std::make_unique<llm_tokenizer_plamo2>(vocab);
            break;
        default:
            GGML_ABORT("unsupported vocab type");
    }

@ -2566,6 +2873,23 @@ std::vector<llama_token> llama_vocab::impl::tokenize(
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif

                        session.tokenize(text, output);
                    } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                        output.push_back(fragment.token);
                    }
                }
            } break;
        case LLAMA_VOCAB_TYPE_PLAMO2:
            {
                llm_tokenizer_plamo2_session session(*static_cast<const llm_tokenizer_plamo2 *>(tokenizer.get()));
                for (const auto & fragment : fragment_buffer) {
                    if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                        std::string text = fragment.raw_text.substr(fragment.offset, fragment.length);

#ifdef PRETOKENIZERDEBUG
                        LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", text.length(), fragment.offset, fragment.length, text.c_str());
#endif

@ -2664,6 +2988,24 @@ int32_t llama_vocab::impl::token_to_piece(llama_token token, char * buf, int32_t
                memcpy(buf, result.data(), result.size());
                return (int)result.size();
            }
            case LLAMA_VOCAB_TYPE_PLAMO2: {
                // PLaMo-2 uses similar token handling to BPE/SPM
                if (vocab.is_byte(token)) {
                    // Handle byte tokens like <0xXX>
                    if (token_text.length() == 6 && token_text.substr(0, 3) == "<0x" && token_text.back() == '>') {
                        int hex_val = std::stoi(token_text.substr(3, 2), nullptr, 16);
                        if (length < 1) {
                            return -1;
                        }
                        buf[0] = static_cast<char>(hex_val);
                        return 1;
                    }
                }

                // Normal token - just copy the text
                std::string result = token_text;
                return _try_copy(result.data(), result.size());
            }
            default:
                GGML_ABORT("fatal error");
        }

@ -2908,6 +3250,12 @@ llama_token llama_vocab::byte_to_token(uint8_t ch) const {
        case LLAMA_VOCAB_TYPE_BPE: {
            return pimpl->token_to_id.at(unicode_byte_to_utf8(ch));
        }
        case LLAMA_VOCAB_TYPE_PLAMO2: {
            // PLaMo-2 uses byte tokens in format <0xXX>
            char hex_str[8];
            snprintf(hex_str, sizeof(hex_str), "<0x%02X>", ch);
            return pimpl->token_to_id.at(hex_str);
        }
        default:
            GGML_ABORT("fatal error");
    }

@ -3009,6 +3357,10 @@ llama_token llama_vocab::token_fim_sep() const {
    return pimpl->special_fim_sep_id;
}

llama_token llama_vocab::token_mask() const {
    return pimpl->special_mask_id;
}

bool llama_vocab::get_add_space_prefix() const {
    return pimpl->add_space_prefix;
}

@ -3249,6 +3601,10 @@ llama_token llama_vocab_fim_sep(const struct llama_vocab * vocab) {
    return vocab->token_fim_sep();
}

llama_token llama_vocab_mask(const struct llama_vocab * vocab) {
    return vocab->token_mask();
}

// deprecated
const char * llama_token_get_text(const struct llama_vocab * vocab, llama_token token) {
    return llama_vocab_get_text(vocab, token);

@ -3385,4 +3741,3 @@ int32_t llama_detokenize(
        bool unparse_special) {
    return vocab->detokenize(tokens, n_tokens, text, text_len_max, remove_special, unparse_special);
}

@ -45,6 +45,7 @@ enum llama_vocab_pre_type {
    LLAMA_VOCAB_PRE_TYPE_PIXTRAL    = 34,
    LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
    LLAMA_VOCAB_PRE_TYPE_HUNYUAN    = 36,
    LLAMA_VOCAB_PRE_TYPE_KIMI_K2    = 37,
};

struct LLM_KV;

@ -100,6 +101,7 @@ struct llama_vocab {
    llama_token token_sep() const;
    llama_token token_nl () const;
    llama_token token_pad() const;
    llama_token token_mask() const;

    llama_token token_prefix() const;
    llama_token token_middle() const;

@ -77,6 +77,7 @@ extern "C" {
        LLAMA_VOCAB_TYPE_WPM    = 3, // BERT tokenizer based on WordPiece
        LLAMA_VOCAB_TYPE_UGM    = 4, // T5 tokenizer based on Unigram
        LLAMA_VOCAB_TYPE_RWKV   = 5, // RWKV tokenizer based on greedy tokenization
        LLAMA_VOCAB_TYPE_PLAMO2 = 6, // PLaMo-2 tokenizer based on Aho-Corasick with dynamic programming
    };

    enum llama_rope_type {

@ -334,6 +335,9 @@ extern "C" {
        bool swa_full;   // use full-size SWA cache (https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)
                         // NOTE: setting to false when n_seq_max > 1 can cause bad performance in some cases
                         // ref: https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573
        bool kv_unified; // use a unified buffer across the input sequences when computing the attention
                         // try to disable when n_seq_max > 1 for improved performance when the sequences do not share a large prefix
                         // ref: https://github.com/ggml-org/llama.cpp/pull/14363
    };

    // model quantization parameters

@ -724,7 +728,7 @@ extern "C" {
    // - lazily on next llama_decode()
    //   p0 < 0 : [0, p1]
    //   p1 < 0 : [p0, inf)
    DEPRECATED(void llama_kv_self_seq_div(
    DEPRECATED(LLAMA_API void llama_kv_self_seq_div(
            struct llama_context * ctx,
            llama_seq_id seq_id,
            llama_pos p0,

@ -952,6 +956,7 @@ extern "C" {
    // in the order they have appeared in the batch.
    // Rows: number of tokens for which llama_batch.logits[i] != 0
    // Cols: n_vocab
    // TODO: deprecate in favor of llama_get_logits_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
    LLAMA_API float * llama_get_logits(struct llama_context * ctx);

    // Logits for the ith token. For positive indices, Equivalent to:

@ -966,6 +971,7 @@ extern "C" {
    // in the order they have appeared in the batch.
    // shape: [n_outputs*n_embd]
    // Otherwise, returns NULL.
    // TODO: deprecate in favor of llama_get_embeddings_ith() (ref: https://github.com/ggml-org/llama.cpp/pull/14853#issuecomment-3113143522)
    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

    // Get the embeddings for the ith token. For positive indices, Equivalent to:

@ -1004,6 +1010,7 @@ extern "C" {
    LLAMA_API llama_token llama_vocab_sep(const struct llama_vocab * vocab); // sentence separator
    LLAMA_API llama_token llama_vocab_nl (const struct llama_vocab * vocab); // next-line
    LLAMA_API llama_token llama_vocab_pad(const struct llama_vocab * vocab); // padding
    LLAMA_API llama_token llama_vocab_mask(const struct llama_vocab * vocab); // mask

    LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
    LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);

@ -1389,6 +1396,7 @@ extern "C" {

        int32_t n_p_eval;
        int32_t n_eval;
        int32_t n_reused; // number of times a ggml compute graph had been reused
    };

    struct llama_perf_sampler_data {

@ -557,6 +557,178 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
    return bpe_offsets;
}

// K2 system regex patterns (from tokenization_kimi.py):
// [\p{Han}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+
static std::vector<size_t> unicode_regex_split_custom_kimi_k2(const std::string & text, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;
    bpe_offsets.reserve(offsets.size());

    const auto cpts = unicode_cpts_from_utf8(text);

    size_t start = 0;
    for (auto offset : offsets) {
        const size_t offset_ini = start;
        const size_t offset_end = start + offset;
        assert(offset_end <= cpts.size());
        start = offset_end;

        static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
        auto _get_cpt = [&] (const size_t pos) -> uint32_t {
            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
        };

        auto _get_flags = [&] (const size_t pos) -> unicode_cpt_flags {
            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags_from_cpt(cpts[pos]) : unicode_cpt_flags{};
        };

        size_t _prev_end = offset_ini;
        auto _add_token = [&] (const size_t end) -> size_t {
            assert(_prev_end <= end && end <= offset_end);
            size_t len = end - _prev_end;
            if (len > 0) {
                bpe_offsets.push_back(len);
            }
            _prev_end = end;
            return len;
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
            const uint32_t cpt = _get_cpt(pos);
            const auto flags = _get_flags(pos);

            // Pattern 1: [\p{Han}]+ (Chinese characters)
            if (unicode_cpt_is_han(cpt)) {
                while (unicode_cpt_is_han(_get_cpt(pos))) {
                    pos++;
                }
                _add_token(pos);
                continue;
            }

            // Pattern 2 & 3: Letter words excluding Han characters with optional contractions
            // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?:'s|'t|'re|'ve|'m|'ll|'d)?
            // [^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?:'s|'t|'re|'ve|'m|'ll|'d)?
            // Check if current char is a letter OR if current char could be a leading char and next char is a letter
            bool is_letter_pattern = (flags.is_letter && !unicode_cpt_is_han(cpt)) ||
                                     (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number) &&
                                      _get_flags(pos + 1).is_letter && !unicode_cpt_is_han(_get_cpt(pos + 1)));

            if (is_letter_pattern) {
                // Handle optional leading non-letter/non-number character
                bool has_leading_char = false;
                if (!(cpt == '\r' || cpt == '\n' || flags.is_letter || flags.is_number)) {
                    has_leading_char = true;
                    pos++;
                }

                // Match letter sequence (excluding Han characters)
                bool has_letters = false;
                while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
                    has_letters = true;
                    pos++;
                }

                // Only proceed if we found letters (after potentially skipping leading char)
                if (has_letters || (!has_leading_char && _get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos)))) {
                    if (!has_letters) pos++; // consume the first letter if we didn't already

                    // Continue consuming letters
                    while (_get_flags(pos).is_letter && !unicode_cpt_is_han(_get_cpt(pos))) {
                        pos++;
                    }

                    // Check for optional contractions (?:'s|'t|'re|'ve|'m|'ll|'d)
                    if (_get_cpt(pos) == '\'' && pos + 1 < offset_end) {
                        uint32_t cpt_next = unicode_tolower(_get_cpt(pos + 1));
                        if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                            pos += 2;
                        } else if (pos + 2 < offset_end) {
                            uint32_t cpt_next_next = unicode_tolower(_get_cpt(pos + 2));
                            if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                                (cpt_next == 'v' && cpt_next_next == 'e') ||
                                (cpt_next == 'l' && cpt_next_next == 'l')) {
                                pos += 3;
                            }
                        }
                    }

                    _add_token(pos);
                    continue;
                } else if (has_leading_char) {
                    // We consumed a leading char but found no letters, backtrack
                    pos--;
                }
            }

            // Pattern 4: \p{N}{1,3} (numbers 1-3 digits)
            if (flags.is_number) {
                size_t ini = pos;
                while (_get_flags(pos).is_number) {
                    if (++pos - ini >= 3) {
                        _add_token(pos);
                        ini = pos;
                    }
                }
                _add_token(pos);
                continue;
            }

            // Pattern 5: ?[^\s\p{L}\p{N}]+[\r\n]* (optional space + non-word chars + optional newlines)
            auto flags2 = (cpt == ' ' ? _get_flags(pos + 1) : flags);
            if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
                pos += (cpt == ' ');
                while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number) && flags2.as_uint()) {
                    flags2 = _get_flags(++pos);
                }
                // Match optional [\r\n]*
                uint32_t cpt2 = _get_cpt(pos);
                while (cpt2 == '\r' || cpt2 == '\n') {
                    cpt2 = _get_cpt(++pos);
                }
                _add_token(pos);
                continue;
            }

            // Count whitespace characters
            size_t num_whitespaces = 0;
            size_t last_end_r_or_n = 0;
            while (_get_flags(pos + num_whitespaces).is_whitespace) {
                uint32_t cpt2 = _get_cpt(pos + num_whitespaces);
                if (cpt2 == '\r' || cpt2 == '\n') {
                    last_end_r_or_n = pos + num_whitespaces + 1;
                }
                num_whitespaces++;
            }

            // Pattern 6: \s*[\r\n]+ (whitespace with newlines)
            if (last_end_r_or_n > 0) {
                pos = last_end_r_or_n;
                _add_token(pos);
                continue;
            }

            // Pattern 7: \s+(?!\S) (trailing whitespace)
            if (num_whitespaces > 1 && _get_cpt(pos + num_whitespaces) != OUT_OF_RANGE) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
            }

            // Pattern 8: \s+ (general whitespace)
            if (num_whitespaces > 0) {
                pos += num_whitespaces;
                _add_token(pos);
                continue;
            }

            // No matches - consume single character
            _add_token(++pos);
        }
    }

    return bpe_offsets;
}
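
// Illustrative behaviour of the splitter above: it records the length (in code
// points) of each matched span into bpe_offsets rather than absolute positions.
// For the input "你好 world 123" it would emit the spans "你好" (pattern 1),
// " world" (patterns 2/3), " " (pattern 8) and "123" (pattern 4), i.e. the
// lengths 2, 6, 1 and 3.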

static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

@ -567,6 +739,9 @@ static std::vector<size_t> unicode_regex_split_custom(const std::string & text,
            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {

        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
    } else if (regex_expr == "\\p{Han}+") {
        // K2's first pattern - handle all K2 patterns together
        bpe_offsets = unicode_regex_split_custom_kimi_k2(text, offsets);
    }

    return bpe_offsets;

@ -672,6 +847,38 @@ uint32_t unicode_tolower(uint32_t cpt) {
    return cpt; // Return the original code point if no lowercase mapping is found
}

bool unicode_cpt_is_han(uint32_t cpt) {
    // Han character ranges (Chinese/CJK characters)
    // CJK Unified Ideographs (most common)
    if (cpt >= 0x4E00 && cpt <= 0x9FFF) return true;

    // CJK Extension A
    if (cpt >= 0x3400 && cpt <= 0x4DBF) return true;

    // CJK Extension B
    if (cpt >= 0x20000 && cpt <= 0x2A6DF) return true;

    // CJK Extension C
    if (cpt >= 0x2A700 && cpt <= 0x2B73F) return true;

    // CJK Extension D
    if (cpt >= 0x2B740 && cpt <= 0x2B81F) return true;

    // CJK Extension E
    if (cpt >= 0x2B820 && cpt <= 0x2CEAF) return true;

    // CJK Extension F
    if (cpt >= 0x2CEB0 && cpt <= 0x2EBEF) return true;

    // CJK Compatibility Ideographs
    if (cpt >= 0xF900 && cpt <= 0xFAFF) return true;

    // CJK Compatibility Ideographs Supplement
    if (cpt >= 0x2F800 && cpt <= 0x2FA1F) return true;

    return false;
}
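
// For example, U+4E2D ("中") falls in the CJK Unified Ideographs range and is
// treated as Han, while U+0041 ("A") and U+3042 (Hiragana "あ") fall outside all
// of the ranges above and are not.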

std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {
    // unicode categories
    static const std::map<std::string, int> k_ucat_enum = {

@ -63,4 +63,6 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8);

uint32_t unicode_tolower(uint32_t cpt);

bool unicode_cpt_is_han(uint32_t cpt);

std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs);