talk-llama : sync llama.cpp

ggml-ci
2025-08-14 08:18:46 +02:00 · 2025-05-27 17:08:24 +03:00
parent 546928c33f
commit 26eb48cb08
18 changed files with 1968 additions and 1178 deletions
--- a/examples/talk-llama/llama-graph.h
+++ b/examples/talk-llama/llama-graph.h
@ -19,6 +19,7 @@ struct llama_cparams;

 class llama_memory_i;
 class llama_kv_cache_unified;
+class llama_kv_cache_unified_iswa;
 class llama_kv_cache_recurrent;

 // certain models (typically multi-modal) can produce different types of graphs
@ -255,6 +256,31 @@ public:

    void set_input(const llama_ubatch * ubatch) override;

+    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }
+
+    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch]
+    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch]
+
+    const llama_hparams & hparams;
+    const llama_cparams & cparams;
+
+    const llama_kv_cache_unified * kv_self;
+};
+
+class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_kv_unified_iswa(
+            const llama_hparams & hparams,
+            const llama_cparams & cparams,
+            const llama_kv_cache_unified_iswa * kv_self) :
+        hparams(hparams),
+        cparams(cparams),
+        kv_self(kv_self) {
+    }
+    ~llm_graph_input_attn_kv_unified_iswa() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

@ -266,7 +292,7 @@ public:
    const llama_hparams & hparams;
    const llama_cparams & cparams;

-    const llama_kv_cache_unified * kv_self;
+    const llama_kv_cache_unified_iswa * kv_self;
 };

 class llm_graph_input_attn_cross : public llm_graph_input_i {
@ -378,7 +404,6 @@ struct llm_graph_context {
    const int64_t n_layer;
    const int64_t n_rot;
    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
-    const int64_t n_ctx_per_seq;
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head_k;
@ -507,13 +532,12 @@ struct llm_graph_context {

    ggml_tensor * build_attn_mha(
             ggml_cgraph * gf,
-             ggml_tensor * q,     // [n_embd_head_q, n_tokens, n_head_q]
-             ggml_tensor * k,     // [n_embd_head_k, n_tokens, n_head_k]
-             ggml_tensor * v,     // [n_embd_head_v, n_tokens, n_head_v] (v_trans == false)
+             ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
+             ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
+             ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
-             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
-                    bool   v_trans,
+             ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                   float   kq_scale) const;

    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
@ -546,6 +570,21 @@ struct llm_graph_context {
                  float   kq_scale,
                    int   il) const;

+    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
+
+    ggml_tensor * build_attn(
+            llm_graph_input_attn_kv_unified_iswa * inp,
+            ggml_cgraph * gf,
+            ggml_tensor * wo,
+            ggml_tensor * wo_b,
+            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * kq_b,
+            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
+                  float   kq_scale,
+                    int   il) const;
+
    llm_graph_input_attn_cross * build_attn_inp_cross() const;

    ggml_tensor * build_attn(
@ -596,3 +635,6 @@ struct llm_graph_context {
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
 };
+
+// TODO: better name
+int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);