metal : optimize FA kernels (llama/10171)

* ggml : add ggml_flash_attn_ext_get_prec
* metal : use F16 precision in FA kernels ggml-ci
* metal : minor clean-up
* metal : compile-guard bf16 FA kernels ggml-ci
* build : remove obsolete compile flag [no ci]
* metal : prevent int overflows [no ci]
* cuda : disable BF16 FA ggml-ci
* metal : fix BF16 requirement for FA kernels ggml-ci
* make : clean-up [no ci]
2025-08-19 12:44:24 +02:00 · 2024-11-08 13:47:22 +02:00
parent 1550be79f1
commit d0b8335789
6 changed files with 494 additions and 338 deletions
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -1746,6 +1746,9 @@ extern "C" {
            struct ggml_tensor * a,
            enum ggml_prec       prec);

+    GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
+            const struct ggml_tensor * a);
+
    // TODO: needs to be adapted to ggml_flash_attn_ext
    GGML_API struct ggml_tensor * ggml_flash_attn_back(
           struct ggml_context * ctx,