talk-llama : sync llama.cpp

ggml-ci
2025-08-11 00:14:50 +02:00 · 2025-07-12 16:26:16 +03:00
parent 6d64e4abf3
commit 6ddff4d96a
24 changed files with 2831 additions and 690 deletions
--- a/examples/talk-llama/llama-memory-hybrid.cpp
+++ b/examples/talk-llama/llama-memory-hybrid.cpp
@ -70,7 +70,7 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
                // if all tokens are output, split by sequence
                ubatch = balloc.split_seq(n_ubatch);
            } else {
-                ubatch = balloc.split_equal(n_ubatch);
+                ubatch = balloc.split_equal(n_ubatch, false);
            }

            if (ubatch.n_tokens == 0) {
@ -80,6 +80,11 @@ llama_memory_context_ptr llama_memory_hybrid::init_batch(llama_batch_allocr & ba
            ubatches.push_back(std::move(ubatch)); // NOLINT
        }

+        if (balloc.get_n_used() < balloc.get_n_tokens()) {
+            // failed to find a suitable split
+            break;
+        }
+
        // prepare the recurrent batches first
        if (!mem_recr->prepare(ubatches)) {
            // TODO: will the recurrent cache be in an undefined context at this point?
@ -195,11 +200,11 @@ llama_memory_hybrid_context::llama_memory_hybrid_context(

 llama_memory_hybrid_context::llama_memory_hybrid_context(
              llama_memory_hybrid * mem,
-            std::vector<uint32_t>   heads_attn,
+                  slot_info_vec_t   sinfos_attn,
        std::vector<llama_ubatch>   ubatches) :
    ubatches(std::move(ubatches)),
    // note: here we copy the ubatches. not sure if this is ideal
-    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(heads_attn), this->ubatches)),
+    ctx_attn(new llama_kv_cache_unified_context(mem->get_mem_attn(), std::move(sinfos_attn), this->ubatches)),
    ctx_recr(new llama_memory_recurrent_context(mem->get_mem_recr(),                        this->ubatches)),
    status(llama_memory_status_combine(ctx_attn->get_status(), ctx_recr->get_status())) {
 }