talk-llama : sync llama.cpp

ggml-ci
Georgi Gerganov
2025-06-01 14:07:36 +03:00
parent 3f46282cbe
commit 7fd6fa8097
22 changed files with 4265 additions and 3552 deletions


@@ -18,6 +18,9 @@ struct llama_kv_cache;
class llama_io_read_i;
class llama_io_write_i;
class llama_memory_i;
class llama_memory_state_i;
struct llama_context {
// init scheduler and compute buffers, reserve worst-case graphs
llama_context(
@@ -47,7 +50,9 @@ struct llama_context {
llama_kv_cache * get_kv_self();
const llama_kv_cache * get_kv_self() const;
void kv_self_update();
// return true if the KV cache was updated
// TODO: remove
bool kv_self_update();
enum llama_pooling_type pooling_type() const;
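
As a minimal sketch of how the new bool return value might be consumed, assuming `ctx` is a llama_context and the caller only needs to know whether the cache contents changed (none of this is code from the commit):

// sketch only: react to the KV cache having been modified (e.g. shifted or defragmented)
if (ctx->kv_self_update()) {
    // the cache layout changed, so anything built against the previous layout
    // may need to be recomputed by the caller
}
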
@@ -88,6 +93,16 @@ struct llama_context {
int32_t il_start,
int32_t il_end);
// process a single ubatch with a specific graph type
// if memory_state is provided, it will be applied first to the context's memory
// ret contains the status of the graph computation
// returns nullptr only if ret != GGML_STATUS_SUCCESS
llm_graph_result_ptr process_ubatch(
const llama_ubatch & ubatch,
llm_graph_type gtype,
llama_memory_state_i * mstate,
ggml_status & ret);
int encode(llama_batch & inp_batch);
int decode(llama_batch & inp_batch);
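
A hedged sketch of how a decode path might drive the new process_ubatch entry point; the `ubatch` and `mstate` variables and the error mapping are illustrative assumptions, not code from this commit:

ggml_status status;

// sketch only: `ubatch` and `mstate` are assumed to have been prepared by the caller;
// per the comment above, the result is null only when `status` reports a failure
llm_graph_result_ptr res = process_ubatch(ubatch, LLM_GRAPH_TYPE_DECODER, mstate, status);
if (!res) {
    LLAMA_LOG_ERROR("%s: failed to process ubatch, status = %d\n", __func__, status);
    return status == GGML_STATUS_ALLOC_FAILED ? -2 : -3;
}
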
@@ -180,16 +195,18 @@ public:
ggml_cgraph * graph_init();
// returns the result of ggml_backend_sched_graph_compute_async execution
ggml_status graph_compute(
ggml_cgraph * gf,
bool batched);
ggml_status graph_compute(ggml_cgraph * gf, bool batched);
// reserve a graph with a dummy ubatch of the specified size
ggml_cgraph * graph_reserve(uint32_t n_tokens, uint32_t n_seqs, uint32_t n_outputs, const llama_memory_state_i * mstate);
private:
llm_graph_result_ptr graph_build(
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
llm_graph_type gtype);
ggml_context * ctx,
ggml_cgraph * gf,
const llama_ubatch & ubatch,
llm_graph_type gtype,
const llama_memory_state_i * mstate);
llm_graph_cb graph_get_cb() const;
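
As a hedged sketch, a worst-case reservation against a given memory state could look like the following; how `mstate` and the size parameters are obtained is an assumption and is not shown in this diff:

// sketch only: n_tokens/n_seqs/n_outputs stand in for the worst-case sizes the
// context reserves for; `mstate` is assumed to describe the memory (KV cache)
// layout the reserved graph must be compatible with
const uint32_t n_tokens  = cparams.n_ubatch;
const uint32_t n_seqs    = cparams.n_seq_max;
const uint32_t n_outputs = n_tokens;

ggml_cgraph * gf = graph_reserve(n_tokens, n_seqs, n_outputs, mstate);
if (gf == nullptr) {
    LLAMA_LOG_ERROR("%s: failed to reserve worst-case graph\n", __func__);
}
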