CUDA: fix q_nope_absorbed prec for DS 2 Lite f16 (llama/13137)

This commit is contained in:
Johannes Gäßler 2025-04-28 09:29:26 +02:00 committed by Georgi Gerganov
parent 9fff2f751c
commit 670bf02662
2 changed files with 4 additions and 4 deletions

View File

@ -393,8 +393,8 @@ extern "C" {
// precision // precision
enum ggml_prec { enum ggml_prec {
GGML_PREC_DEFAULT, GGML_PREC_DEFAULT = 0, // stored as ggml_tensor.op_params, 0 by default
GGML_PREC_F32, GGML_PREC_F32 = 10,
}; };
// model file types // model file types

View File

@ -1935,8 +1935,8 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst); ggml_cuda_mul_mat_vec(ctx, src0, src1, nullptr, dst);
} else if (!split && use_mul_mat_vec_q) { } else if (!split && use_mul_mat_vec_q) {
ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst); ggml_cuda_mul_mat_vec_q(ctx, src0, src1, nullptr, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) } else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16) &&
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { dst->op_params[0] == GGML_PREC_DEFAULT && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// general KQ + KQV multi-batch without FlashAttention // general KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_mul_mat_vec) { } else if (use_mul_mat_vec) {