Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-08-11 15:58:05 +02:00)
talk-llama : sync llama.cpp
@@ -33,13 +33,19 @@ llama_kv_cache_unified::llama_kv_cache_unified(
     GGML_ASSERT(kv_size % n_pad == 0);
 
+    // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
+    auto n_layer_cache = hparams.n_layer;
+    if (model.arch == LLM_ARCH_GEMMA3N) {
+        n_layer_cache = 20;
+    }
+
     // create a context for each buffer type
     std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
     auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
         auto it = ctx_map.find(buft);
         if (it == ctx_map.end()) {
             ggml_init_params params = {
-                /*.mem_size   =*/ size_t(2u*hparams.n_layer*ggml_tensor_overhead()),
+                /*.mem_size   =*/ size_t(2u*n_layer_cache*ggml_tensor_overhead()),
                 /*.mem_buffer =*/ NULL,
                 /*.no_alloc   =*/ true,
             };
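Above, the context created per buffer type only needs room for tensor metadata (two tensors, K and V, per cached layer) because no_alloc is set; the actual buffers are allocated later. A minimal sketch of the same lazy map-per-buffer-type pattern, using hypothetical buft/gctx stand-ins instead of the real ggml types:

#include <cstdio>
#include <map>
#include <memory>
#include <string>

// hypothetical stand-ins for ggml_backend_buffer_type_t / ggml_context
struct buft { std::string name; };
struct gctx { size_t mem_size; };

int main() {
    const size_t n_layer_cache   = 20;   // layers that get their own K/V tensors
    const size_t tensor_overhead = 368;  // hypothetical per-tensor metadata size

    std::map<const buft *, std::unique_ptr<gctx>> ctx_map;

    // lazily create one context per buffer type, sized for 2 tensors (K and V) per cached layer
    auto ctx_for_buft = [&](const buft * bt) -> gctx * {
        auto it = ctx_map.find(bt);
        if (it == ctx_map.end()) {
            auto ctx = std::make_unique<gctx>(gctx{ 2u*n_layer_cache*tensor_overhead });
            gctx * res = ctx.get();
            ctx_map.emplace(bt, std::move(ctx));
            return res;
        }
        return it->second.get();
    };

    buft cpu{"CPU"}, gpu{"GPU"};
    std::printf("cpu ctx mem_size = %zu\n", ctx_for_buft(&cpu)->mem_size);
    std::printf("same ctx reused  = %d\n",  ctx_for_buft(&cpu) == ctx_for_buft(&cpu));
    std::printf("gpu ctx distinct = %d\n",  ctx_for_buft(&gpu) != ctx_for_buft(&cpu));
}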
@@ -62,7 +68,7 @@ llama_kv_cache_unified::llama_kv_cache_unified(
 
     cells.resize(kv_size);
 
-    for (uint32_t il = 0; il < hparams.n_layer; il++) {
+    for (uint32_t il = 0; il < n_layer_cache; il++) {
         if (filter && !filter(il)) {
             LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
             continue;
@@ -102,6 +108,26 @@ llama_kv_cache_unified::llama_kv_cache_unified(
         layers.push_back({ il, k, v });
     }
 
+    // TODO: this is temporary until we support passing reuse layer filters [KV_REUSE]
+    if (model.arch == LLM_ARCH_GEMMA3N) {
+        LLAMA_LOG_DEBUG("%s: GEMMA3N: reuse layers [%d, %d]\n", __func__, n_layer_cache, hparams.n_layer - 1);
+
+        for (uint32_t il = n_layer_cache; il < hparams.n_layer; il++) {
+            if (filter && !filter(il)) {
+                LLAMA_LOG_DEBUG("%s: layer %3d: skipped\n", __func__, il);
+                continue;
+            }
+
+            const bool is_swa = hparams.is_swa(il);
+            const uint32_t il_reuse = n_layer_cache - (is_swa ? 2 : 1);
+
+            GGML_ASSERT(map_layer_ids.find(il_reuse) != map_layer_ids.end());
+            map_layer_ids[il] = map_layer_ids[il_reuse];
+
+            LLAMA_LOG_DEBUG("%s: layer %3d: reuse layer %d, isw = %d\n", __func__, il, il_reuse, is_swa);
+        }
+    }
+
     // allocate tensors and initialize the buffers to avoid NaNs in the padding
     for (auto it : ctx_map) {
         auto * buft = it.first;
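The reuse block maps every layer at or above n_layer_cache onto one of the last two cached layers: n_layer_cache - 2 for sliding-window (SWA) layers and n_layer_cache - 1 otherwise, so the reused cache matches the layer's attention type. A self-contained sketch of that mapping, using a made-up is_swa pattern in place of hparams.is_swa(il):

#include <cstdint>
#include <cstdio>
#include <map>

int main() {
    const uint32_t n_layer       = 30;  // hypothetical total layer count
    const uint32_t n_layer_cache = 20;  // layers that actually own K/V tensors

    // hypothetical sliding-window pattern; in the real code this comes from hparams.is_swa(il)
    auto is_swa = [](uint32_t il) { return il % 2 == 0; };

    std::map<uint32_t, uint32_t> map_layer_ids;
    for (uint32_t il = 0; il < n_layer_cache; il++) {
        map_layer_ids[il] = il;  // layers with their own cache map to themselves
    }

    // layers without a cache reuse the last SWA or non-SWA cached layer
    for (uint32_t il = n_layer_cache; il < n_layer; il++) {
        const uint32_t il_reuse = n_layer_cache - (is_swa(il) ? 2 : 1);
        map_layer_ids[il] = map_layer_ids[il_reuse];
        std::printf("layer %2u -> reuses KV of layer %u (swa = %d)\n", il, il_reuse, (int) is_swa(il));
    }
}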
@@ -307,7 +333,7 @@ llama_pos llama_kv_cache_unified::seq_pos_max(llama_seq_id seq_id) const {
     return cells.seq_pos_max(seq_id);
 }
 
-llama_memory_state_ptr llama_kv_cache_unified::init_batch(
+llama_memory_context_ptr llama_kv_cache_unified::init_batch(
         llama_batch_allocr & balloc,
         uint32_t n_ubatch,
         bool embd_all) {
@@ -332,18 +358,18 @@ llama_memory_state_ptr llama_kv_cache_unified::init_batch(
             break;
         }
 
-        return std::make_unique<llama_kv_cache_unified_state>(
+        return std::make_unique<llama_kv_cache_unified_context>(
                 this, std::move(heads), std::move(ubatches));
     } while (false);
 
-    return std::make_unique<llama_kv_cache_unified_state>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
+    return std::make_unique<llama_kv_cache_unified_context>(LLAMA_MEMORY_STATUS_FAILED_PREPARE);
 }
 
-llama_memory_state_ptr llama_kv_cache_unified::init_full() {
-    return std::make_unique<llama_kv_cache_unified_state>(this);
+llama_memory_context_ptr llama_kv_cache_unified::init_full() {
+    return std::make_unique<llama_kv_cache_unified_context>(this);
 }
 
-llama_memory_state_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) {
+llama_memory_context_ptr llama_kv_cache_unified::init_update(llama_context * lctx, bool optimize) {
     bool do_shift = get_has_shift();
 
     defrag_info dinfo;
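init_batch(), init_full() and init_update() now return a llama_memory_context_ptr that the caller drives with apply()/next()/get_ubatch() (those methods appear further down in this diff). A hedged sketch of that consumer loop, with small stand-in types so it compiles on its own; the real interface and call site live in llama.cpp:

#include <cstdio>
#include <memory>
#include <vector>

// hypothetical stand-ins; the real types are internal to llama.cpp
enum llama_memory_status { LLAMA_MEMORY_STATUS_SUCCESS, LLAMA_MEMORY_STATUS_FAILED_PREPARE };
struct llama_ubatch { int n_tokens; };

struct memory_context {
    llama_memory_status status = LLAMA_MEMORY_STATUS_SUCCESS;
    std::vector<llama_ubatch> ubatches;
    size_t i_next = 0;

    llama_memory_status get_status() const { return status; }
    bool apply() { return status == LLAMA_MEMORY_STATUS_SUCCESS; }  // reserve cells / set inputs
    const llama_ubatch & get_ubatch() const { return ubatches[i_next]; }
    bool next() { return ++i_next < ubatches.size(); }              // advance, false when done
};

int main() {
    auto mctx = std::make_unique<memory_context>();
    mctx->ubatches = { {32}, {32}, {7} };  // three micro-batches, as init_batch() would split them

    if (mctx->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) return 1;  // e.g. FAILED_PREPARE

    do {
        if (!mctx->apply()) return 1;
        std::printf("run graph for ubatch of %d tokens\n", mctx->get_ubatch().n_tokens);
    } while (mctx->next());
}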
@@ -373,7 +399,7 @@ llama_memory_state_ptr llama_kv_cache_unified::init_update(llama_context * lctx,
         }
     }
 
-    return std::make_unique<llama_kv_cache_unified_state>(this, lctx, do_shift, std::move(dinfo));
+    return std::make_unique<llama_kv_cache_unified_context>(this, lctx, do_shift, std::move(dinfo));
 }
 
 llama_kv_cache_unified::ubatch_heads llama_kv_cache_unified::prepare(const std::vector<llama_ubatch> & ubatches) {
@@ -1710,18 +1736,18 @@ bool llama_kv_cache_unified::state_read_data(llama_io_read_i & io, uint32_t cell
 }
 
 //
-// llama_kv_cache_unified_state
+// llama_kv_cache_unified_context
 //
 
-llama_kv_cache_unified_state::llama_kv_cache_unified_state(llama_memory_status status) : status(status) {}
+llama_kv_cache_unified_context::llama_kv_cache_unified_context(llama_memory_status status) : status(status) {}
 
-llama_kv_cache_unified_state::llama_kv_cache_unified_state(
+llama_kv_cache_unified_context::llama_kv_cache_unified_context(
         llama_kv_cache_unified * kv) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv) {
     n_kv = kv->get_size();
     head = 0;
 }
 
-llama_kv_cache_unified_state::llama_kv_cache_unified_state(
+llama_kv_cache_unified_context::llama_kv_cache_unified_context(
         llama_kv_cache_unified * kv,
         llama_context * lctx,
         bool do_shift,
@@ -1731,15 +1757,15 @@ llama_kv_cache_unified_state::llama_kv_cache_unified_state(
     }
 }
 
-llama_kv_cache_unified_state::llama_kv_cache_unified_state(
+llama_kv_cache_unified_context::llama_kv_cache_unified_context(
         llama_kv_cache_unified * kv,
         llama_kv_cache_unified::ubatch_heads heads,
        std::vector<llama_ubatch> ubatches) : status(LLAMA_MEMORY_STATUS_SUCCESS), kv(kv), heads(std::move(heads)), ubatches(std::move(ubatches)) {
 }
 
-llama_kv_cache_unified_state::~llama_kv_cache_unified_state() = default;
+llama_kv_cache_unified_context::~llama_kv_cache_unified_context() = default;
 
-bool llama_kv_cache_unified_state::next() {
+bool llama_kv_cache_unified_context::next() {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
     if (++i_next >= ubatches.size()) {
@@ -1749,8 +1775,8 @@ bool llama_kv_cache_unified_state::next() {
     return true;
 }
 
-bool llama_kv_cache_unified_state::apply() {
-    assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
+bool llama_kv_cache_unified_context::apply() {
+    assert(!llama_memory_status_is_fail(status));
 
     // no ubatches -> this is a KV cache update
     if (ubatches.empty()) {
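The assert in apply() is relaxed from "status == SUCCESS" to "status is not a failure", so a context whose update turned out to be a no-op can still be applied. A sketch of what such a helper could look like; the NO_UPDATE value and the exact enum layout are assumptions, not taken from this diff:

// hypothetical mirror of the helper referenced above; names assumed from the llama.cpp memory API
enum llama_memory_status {
    LLAMA_MEMORY_STATUS_SUCCESS,
    LLAMA_MEMORY_STATUS_NO_UPDATE,       // valid, but there is nothing to apply
    LLAMA_MEMORY_STATUS_FAILED_PREPARE,
    LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
};

bool llama_memory_status_is_fail(llama_memory_status status) {
    return status == LLAMA_MEMORY_STATUS_FAILED_PREPARE ||
           status == LLAMA_MEMORY_STATUS_FAILED_COMPUTE;
}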
@@ -1767,45 +1793,45 @@ bool llama_kv_cache_unified_state::apply() {
     return true;
 }
 
-llama_memory_status llama_kv_cache_unified_state::get_status() const {
+llama_memory_status llama_kv_cache_unified_context::get_status() const {
     return status;
 }
 
-const llama_ubatch & llama_kv_cache_unified_state::get_ubatch() const {
+const llama_ubatch & llama_kv_cache_unified_context::get_ubatch() const {
     assert(status == LLAMA_MEMORY_STATUS_SUCCESS);
 
     return ubatches[i_next];
 }
 
-uint32_t llama_kv_cache_unified_state::get_n_kv() const {
+uint32_t llama_kv_cache_unified_context::get_n_kv() const {
     return n_kv;
 }
 
-ggml_tensor * llama_kv_cache_unified_state::get_k(ggml_context * ctx, int32_t il) const {
+ggml_tensor * llama_kv_cache_unified_context::get_k(ggml_context * ctx, int32_t il) const {
     return kv->get_k(ctx, il, n_kv);
 }
 
-ggml_tensor * llama_kv_cache_unified_state::get_v(ggml_context * ctx, int32_t il) const {
+ggml_tensor * llama_kv_cache_unified_context::get_v(ggml_context * ctx, int32_t il) const {
     return kv->get_v(ctx, il, n_kv);
 }
 
-ggml_tensor * llama_kv_cache_unified_state::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
+ggml_tensor * llama_kv_cache_unified_context::cpy_k(ggml_context * ctx, ggml_tensor * k_cur, int32_t il) const {
     return kv->cpy_k(ctx, k_cur, il, head);
 }
 
-ggml_tensor * llama_kv_cache_unified_state::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
+ggml_tensor * llama_kv_cache_unified_context::cpy_v(ggml_context * ctx, ggml_tensor * v_cur, int32_t il) const {
     return kv->cpy_v(ctx, v_cur, il, head);
 }
 
-void llama_kv_cache_unified_state::set_input_k_shift(ggml_tensor * dst) const {
+void llama_kv_cache_unified_context::set_input_k_shift(ggml_tensor * dst) const {
     kv->set_input_k_shift(dst);
 }
 
-void llama_kv_cache_unified_state::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
+void llama_kv_cache_unified_context::set_input_kq_mask(ggml_tensor * dst, const llama_ubatch * ubatch, bool causal_attn) const {
     kv->set_input_kq_mask(dst, ubatch, causal_attn);
 }
 
-void llama_kv_cache_unified_state::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
+void llama_kv_cache_unified_context::set_input_pos_bucket(ggml_tensor * dst, const llama_ubatch * ubatch) const {
     kv->set_input_pos_bucket(dst, ubatch);
 }
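The renamed context keeps the same read/write split as before: get_k()/get_v() hand out a view over the first n_kv cells for attention reads, while cpy_k()/cpy_v() write the current ubatch starting at offset head. A toy sketch of that idea on a flat buffer, with made-up sizes:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    const int kv_size = 8;   // total cells in the cache (hypothetical)
    const int n_embd  = 4;   // per-cell K width (hypothetical)

    std::vector<float> k_cache(kv_size * n_embd, 0.0f);

    const int head = 3;      // where the current ubatch gets written
    const int n_kv = 5;      // how many cells the attention view has to cover

    // "cpy_k": write the new token's K row at offset head
    std::vector<float> k_cur = {1.f, 2.f, 3.f, 4.f};
    std::copy(k_cur.begin(), k_cur.end(), k_cache.begin() + head*n_embd);

    // "get_k": attention reads a view over the first n_kv cells only
    for (int cell = 0; cell < n_kv; cell++) {
        std::printf("cell %d: %.0f %.0f %.0f %.0f\n", cell,
                    k_cache[cell*n_embd + 0], k_cache[cell*n_embd + 1],
                    k_cache[cell*n_embd + 2], k_cache[cell*n_embd + 3]);
    }
}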