talk-llama : sync llama.cpp (#3084)

ggml-ci
2025-08-18 06:10:10 +02:00 · 2025-04-28 16:40:23 +03:00
parent 28dcdff4c5
commit f3c42399a3
36 changed files with 16940 additions and 12400 deletions
--- a/examples/talk-llama/llama-hparams.h
+++ b/examples/talk-llama/llama-hparams.h
@@ -36,12 +36,17 @@ struct llama_hparams {
    uint32_t n_layer;
    uint32_t n_rot;
    uint32_t n_swa = 0; // sliding window attention (SWA)
+    uint32_t n_swa_pattern = 1; // by default, all layers use non-sliding-window attention
    uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
    uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
    uint32_t n_expert = 0;
    uint32_t n_expert_used = 0;
    uint32_t n_rel_attn_bkts = 0;

+    // note: deepseek2 using MLA converts into MQA with larger heads, then decompresses to MHA
+    uint32_t n_embd_head_k_mla = 0;
+    uint32_t n_embd_head_v_mla = 0;
+
    // for WavTokenizer
    struct llama_hparams_posnet   posnet;
    struct llama_hparams_convnext convnext;
@@ -75,10 +80,16 @@ struct llama_hparams {
    uint32_t time_decay_extra_dim   = 0;
    uint32_t wkv_head_size          = 0;
    uint32_t token_shift_count      = 2;
+    uint32_t n_lora_decay           = 0;
+    uint32_t n_lora_iclr            = 0;
+    uint32_t n_lora_value_res_mix   = 0;
+    uint32_t n_lora_gate            = 0;

    float    rope_attn_factor = 1.0f;
    float    rope_freq_base_train;
+    float    rope_freq_base_train_swa;
    float    rope_freq_scale_train;
+    float    rope_freq_scale_train_swa;
    uint32_t n_ctx_orig_yarn;
    float    rope_yarn_log_mul;

@@ -105,6 +116,14 @@ struct llama_hparams {
    bool use_alibi     = false;
    bool attn_soft_cap = false;

+    uint32_t n_moe_layer_step        = 0;
+    bool     use_kq_norm             = true;
+    uint32_t n_attn_chunk            = 0;
+    // values below seems to be fixed on llama4
+    uint32_t n_no_rope_layer_step    = 4;
+    uint32_t n_attn_temp_floor_scale = 8192;
+    float    f_attn_temp_scale       = 0.1;
+
    // needed by encoder-decoder models (e.g. T5, FLAN-T5)
    // ref: https://github.com/ggerganov/llama.cpp/pull/8141
    llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
@@ -133,6 +152,8 @@ struct llama_hparams {

    // dimension of the recurrent state embeddings
    uint32_t n_embd_v_s() const;
+
+    bool is_swa(uint32_t il) const;
 };

 static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");