Merge branch 'master' into feat/ci-test

2025-08-18 08:50:37 +02:00 · 2024-08-21 16:20:17 +03:00
parent f8895f3c84 9e3c5345cd
commit 7ef875a9c0
49 changed files with 1593 additions and 145002 deletions
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -451,6 +451,8 @@ extern "C" {
        GGML_OP_SQR,
        GGML_OP_SQRT,
        GGML_OP_LOG,
        GGML_OP_SIN,
        GGML_OP_COS,
        GGML_OP_SUM,
        GGML_OP_SUM_ROWS,
        GGML_OP_MEAN,
@@ -967,6 +969,22 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_sin(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_sin_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_cos(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    GGML_API struct ggml_tensor * ggml_cos_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor  * a);
    // return scalar
    GGML_API struct ggml_tensor * ggml_sum(
            struct ggml_context * ctx,
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -2181,6 +2181,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_ADD:
            ggml_cuda_op_add(ctx, dst);
            break;
        case GGML_OP_SUB:
            ggml_cuda_op_sub(ctx, dst);
            break;
        case GGML_OP_ACC:
            ggml_cuda_op_acc(ctx, dst);
            break;
@@ -2267,6 +2270,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
        case GGML_OP_SQRT:
            ggml_cuda_op_sqrt(ctx, dst);
            break;
        case GGML_OP_SIN:
            ggml_cuda_op_sin(ctx, dst);
            break;
        case GGML_OP_COS:
            ggml_cuda_op_cos(ctx, dst);
            break;
        case GGML_OP_CLAMP:
            ggml_cuda_op_clamp(ctx, dst);
            break;
@@ -2853,12 +2862,15 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_TRANSPOSE:
        case GGML_OP_NORM:
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_RMS_NORM:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_CONT:
        case GGML_OP_DIAG_MASK_INF:
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -9,6 +9,10 @@ static __device__ __forceinline__ float op_add(const float a, const float b) {
    return a + b;
 }
 static __device__ __forceinline__ float op_sub(const float a, const float b) {
    return a - b;
 }
 static __device__ __forceinline__ float op_mul(const float a, const float b) {
    return a * b;
 }
@@ -271,6 +275,10 @@ void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_add>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
 void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_sub>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    ggml_cuda_op_bin_bcast<bin_bcast_cuda<op_mul>>(dst->src[0], dst->src[1], dst, dst->src[0]->data, dst->src[1]->data, dst->data, ctx.stream());
 }
--- a/ggml/src/ggml-cuda/binbcast.cuh
+++ b/ggml/src/ggml-cuda/binbcast.cuh
@@ -2,5 +2,6 @@
 void ggml_cuda_op_repeat(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_add(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_sub(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_mul(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_div(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-cuda/unary.cu
+++ b/ggml/src/ggml-cuda/unary.cu
@@ -101,6 +101,24 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
    dst[i] = sqrtf(x[i]);
 }
 static __global__ void sin_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = sinf(x[i]);
 }
 static __global__ void cos_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = cosf(x[i]);
 }
 static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
@@ -156,6 +174,16 @@ static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_
    sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
    sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
    const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
    cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
@@ -312,3 +340,31 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
 void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
 void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    const ggml_tensor * src0 = dst->src[0];
    const float * src0_d = (const float *)src0->data;
    float * dst_d = (float *)dst->data;
    cudaStream_t stream = ctx.stream();
    GGML_ASSERT(ggml_is_contiguous(src0));
    GGML_ASSERT(src0->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
    cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
 }
--- a/ggml/src/ggml-cuda/unary.cuh
+++ b/ggml/src/ggml-cuda/unary.cuh
@@ -9,6 +9,8 @@
 #define CUDA_HARDSWISH_BLOCK_SIZE 256
 #define CUDA_SQR_BLOCK_SIZE 256
 #define CUDA_SQRT_BLOCK_SIZE 256
 #define CUDA_SIN_BLOCK_SIZE 256
 #define CUDA_COS_BLOCK_SIZE 256
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -31,3 +33,7 @@ void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
 void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -31,6 +31,8 @@ struct ggml_metal_kernel {
 enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_ADD,
    GGML_METAL_KERNEL_TYPE_ADD_ROW,
    GGML_METAL_KERNEL_TYPE_SUB,
    GGML_METAL_KERNEL_TYPE_SUB_ROW,
    GGML_METAL_KERNEL_TYPE_MUL,
    GGML_METAL_KERNEL_TYPE_MUL_ROW,
    GGML_METAL_KERNEL_TYPE_DIV,
@@ -205,6 +207,9 @@ enum ggml_metal_kernel_type {
    GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
    GGML_METAL_KERNEL_TYPE_CONCAT,
    GGML_METAL_KERNEL_TYPE_SQR,
    GGML_METAL_KERNEL_TYPE_SQRT,
    GGML_METAL_KERNEL_TYPE_SIN,
    GGML_METAL_KERNEL_TYPE_COS,
    GGML_METAL_KERNEL_TYPE_SUM_ROWS,
    GGML_METAL_KERNEL_TYPE_COUNT
@@ -491,6 +496,8 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD,                           add,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ADD_ROW,                       add_row,                        true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB,                           sub,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUB_ROW,                       sub_row,                        true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL,                           mul,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_ROW,                       mul_row,                        true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_DIV,                           div,                            true);
@@ -665,6 +672,9 @@ static struct ggml_backend_metal_context * ggml_metal_init(int n_cb) {
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,                cpy_f32_iq4_nl,                 true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CONCAT,                        concat,                         true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQR,                           sqr,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SQRT,                          sqrt,                           true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIN,                           sin,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_COS,                           cos,                            true);
        GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SUM_ROWS,                      sum_rows,                       true);
    }
@@ -765,15 +775,20 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_context * ctx
        case GGML_OP_PERMUTE:
        case GGML_OP_CONCAT:
        case GGML_OP_ADD:
        case GGML_OP_SUB:
        case GGML_OP_ACC:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
        case GGML_OP_REPEAT:
        case GGML_OP_SCALE:
        case GGML_OP_CLAMP:
        case GGML_OP_SQR:
        case GGML_OP_SUM_ROWS:
            return true;
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
            return ggml_is_contiguous(op->src[0]);
        case GGML_OP_SUM_ROWS:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_RMS_NORM:
        case GGML_OP_GROUP_NORM:
@@ -1050,6 +1065,7 @@ static enum ggml_status ggml_metal_graph_compute(
                        [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                    } break;
                case GGML_OP_ADD:
                case GGML_OP_SUB:
                case GGML_OP_MUL:
                case GGML_OP_DIV:
                    {
@@ -1073,6 +1089,7 @@ static enum ggml_status ggml_metal_graph_compute(
                            nb = ne00 / 4;
                            switch (dst->op) {
                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
                                case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
                                default: GGML_ABORT("fatal error");
@@ -1082,6 +1099,7 @@ static enum ggml_status ggml_metal_graph_compute(
                        } else {
                            switch (dst->op) {
                                case GGML_OP_ADD: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
                                case GGML_OP_SUB: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
                                case GGML_OP_MUL: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
                                case GGML_OP_DIV: pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
                                default: GGML_ABORT("fatal error");
@@ -1409,6 +1427,48 @@ static enum ggml_status ggml_metal_graph_compute(
                        const int64_t n = ggml_nelements(dst);
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_SQRT:
                    {
                        GGML_ASSERT(ggml_is_contiguous(src0));
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
                        const int64_t n = ggml_nelements(dst);
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_SIN:
                    {
                        GGML_ASSERT(ggml_is_contiguous(src0));
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIN].pipeline;
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
                        const int64_t n = ggml_nelements(dst);
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_COS:
                    {
                        GGML_ASSERT(ggml_is_contiguous(src0));
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_COS].pipeline;
                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst  offset:offs_dst atIndex:1];
                        const int64_t n = ggml_nelements(dst);
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_SUM_ROWS:
--- a/ggml/src/ggml-metal.metal
+++ b/ggml/src/ggml-metal.metal
@@ -17,7 +17,7 @@ enum ggml_sort_order {
    GGML_SORT_ORDER_DESC,
 };
-// general-purpose kernel for addition, multiplication and division of two tensors
+// general-purpose kernel for addition, subtraction, multiplication and division of two tensors
 // pros: works for non-contiguous tensors, supports broadcast across all dims
 // cons: not very efficient
 kernel void kernel_add(
@@ -70,6 +70,56 @@ kernel void kernel_add(
    }
 }
 kernel void kernel_sub(
        device const char * src0,
        device const char * src1,
        device       char * dst,
        constant  int64_t & ne00,
        constant  int64_t & ne01,
        constant  int64_t & ne02,
        constant  int64_t & ne03,
        constant uint64_t & nb00,
        constant uint64_t & nb01,
        constant uint64_t & nb02,
        constant uint64_t & nb03,
        constant  int64_t & ne10,
        constant  int64_t & ne11,
        constant  int64_t & ne12,
        constant  int64_t & ne13,
        constant uint64_t & nb10,
        constant uint64_t & nb11,
        constant uint64_t & nb12,
        constant uint64_t & nb13,
        constant  int64_t & ne0,
        constant  int64_t & ne1,
        constant  int64_t & ne2,
        constant  int64_t & ne3,
        constant uint64_t & nb0,
        constant uint64_t & nb1,
        constant uint64_t & nb2,
        constant uint64_t & nb3,
        constant  int64_t & offs,
        uint3 tgpig[[threadgroup_position_in_grid]],
        uint3 tpitg[[thread_position_in_threadgroup]],
        uint3   ntg[[threads_per_threadgroup]]) {
    const int64_t i03 = tgpig.z;
    const int64_t i02 = tgpig.y;
    const int64_t i01 = tgpig.x;
    const int64_t i13 = i03 % ne13;
    const int64_t i12 = i02 % ne12;
    const int64_t i11 = i01 % ne11;
    device const char * src0_ptr = src0 + i03*nb03 + i02*nb02 + i01*nb01 + offs;
    device const char * src1_ptr = src1 + i13*nb13 + i12*nb12 + i11*nb11;
    device       char * dst_ptr  = dst  + i03*nb3  + i02*nb2  + i01*nb1  + offs;
    for (int i0 = tpitg.x; i0 < ne0; i0 += ntg.x) {
        const int i10 = i0 % ne10;
        *((device float *)(dst_ptr + i0*nb0)) = *((device float *)(src0_ptr + i0*nb00)) - *((device float *)(src1_ptr + i10*nb10));
    }
 }
 kernel void kernel_mul(
        device const char * src0,
        device const char * src1,
@@ -226,6 +276,15 @@ kernel void kernel_add_row(
    dst[tpig] = src0[tpig] + src1[tpig % nb];
 }
 kernel void kernel_sub_row(
        device const float4 * src0,
        device const float4 * src1,
        device       float4 * dst,
        constant   uint64_t & nb [[buffer(28)]],
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = src0[tpig] - src1[tpig % nb];
 }
 kernel void kernel_mul_row(
        device const float4 * src0,
        device const float4 * src1,
@@ -358,6 +417,27 @@ kernel void kernel_sqr(
    dst[tpig] = src0[tpig] * src0[tpig];
 }
 kernel void kernel_sqrt(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = sqrt(src0[tpig]);
 }
 kernel void kernel_sin(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = sin(src0[tpig]);
 }
 kernel void kernel_cos(
        device const float * src0,
        device       float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = cos(src0[tpig]);
 }
 kernel void kernel_sum_rows(
        device const float * src0,
        device       float * dst,
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -3644,7 +3644,7 @@ void quantize_row_q8_K(const float * restrict x, void * restrict y, int64_t k) {
    quantize_row_q8_K_ref(x, y, k);
 }
-//===================================== Dot ptoducts =================================
+//===================================== Dot products =================================
 //
 // Helper functions
--- a/ggml/src/ggml-vulkan-shaders.hpp
+++ b/ggml/src/ggml-vulkan-shaders.hpp
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -184,6 +184,8 @@ struct vk_device_struct {
    vk_pipeline pipeline_upscale_f32;
    vk_pipeline pipeline_scale_f32;
    vk_pipeline pipeline_sqr_f32;
    vk_pipeline pipeline_sin_f32;
    vk_pipeline pipeline_cos_f32;
    vk_pipeline pipeline_clamp_f32;
    vk_pipeline pipeline_pad_f32;
    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
@@ -1654,6 +1656,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
    ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_sin_f32, "sin_f32", sin_f32_len, sin_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_cos_f32, "cos_f32", cos_f32_len, cos_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
    ggml_vk_create_pipeline(device, device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
@@ -3972,6 +3976,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
            return ctx->device->pipeline_sqr_f32;
        }
        return nullptr;
    case GGML_OP_SIN:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_sin_f32;
        }
        return nullptr;
    case GGML_OP_COS:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_cos_f32;
        }
        return nullptr;
    case GGML_OP_CLAMP:
        if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
            return ctx->device->pipeline_clamp_f32;
@@ -4124,6 +4138,8 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) {
    case GGML_OP_UPSCALE:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SIN:
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
        return true;
@@ -4335,6 +4351,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
        case GGML_OP_MUL:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_PAD:
        case GGML_OP_CPY:
@@ -4576,6 +4594,32 @@ static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const
    });
 }
 static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    const uint32_t src0_type_size = ggml_type_size(src0->type);
    const uint32_t dst_type_size = ggml_type_size(dst->type);
    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
        (uint32_t)ggml_nelements(src0),
        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
        0,
        0.0f, 0.0f,
    });
 }
 static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    const uint32_t src0_type_size = ggml_type_size(src0->type);
    const uint32_t dst_type_size = ggml_type_size(dst->type);
    ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
        (uint32_t)ggml_nelements(src0),
        (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
        (uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] /  dst_type_size, (uint32_t) dst->nb[1] /  dst_type_size, (uint32_t) dst->nb[2] /  dst_type_size, (uint32_t) dst->nb[3] /  dst_type_size,
        0,
        0.0f, 0.0f,
    });
 }
 static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst) {
    float * op_params = (float *)dst->op_params;
    const uint32_t src0_type_size = ggml_type_size(src0->type);
@@ -5481,6 +5525,8 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
    case GGML_OP_ADD:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SIN:
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_CPY:
@@ -5761,6 +5807,8 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_UPSCALE:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SIN:
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_CPY:
@@ -5832,6 +5880,14 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
    case GGML_OP_SQR:
        ggml_vk_sqr(ctx, compute_ctx, src0, node);
        break;
    case GGML_OP_SIN:
        ggml_vk_sin(ctx, compute_ctx, src0, node);
        break;
    case GGML_OP_COS:
        ggml_vk_cos(ctx, compute_ctx, src0, node);
        break;
    case GGML_OP_CLAMP:
        ggml_vk_clamp(ctx, compute_ctx, src0, node);
@@ -5943,6 +5999,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
    case GGML_OP_UPSCALE:
    case GGML_OP_SCALE:
    case GGML_OP_SQR:
    case GGML_OP_SIN:
    case GGML_OP_COS:
    case GGML_OP_CLAMP:
    case GGML_OP_PAD:
    case GGML_OP_CPY:
@@ -6658,6 +6716,8 @@ GGML_CALL static bool ggml_backend_vk_supports_op(ggml_backend_t backend, const
        case GGML_OP_UPSCALE:
        case GGML_OP_SCALE:
        case GGML_OP_SQR:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
        case GGML_OP_PAD:
        case GGML_OP_CONT:
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2310,7 +2310,9 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
 inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s);   }
 inline static void ggml_vec_sqr_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i];   }
 inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); }
-inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);   }
+inline static void ggml_vec_log_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]);  }
 inline static void ggml_vec_sin_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sinf(x[i]);  }
 inline static void ggml_vec_cos_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = cosf(x[i]);  }
 inline static void ggml_vec_abs_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); }
 inline static void ggml_vec_sgn_f32  (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
@@ -2760,6 +2762,8 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "SQR",
    "SQRT",
    "LOG",
    "SIN",
    "COS",
    "SUM",
    "SUM_ROWS",
    "MEAN",
@@ -2833,7 +2837,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
    "CROSS_ENTROPY_LOSS_BACK",
 };
-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "none",
@@ -2848,6 +2852,8 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "x^2",
    "√x",
    "log(x)",
    "sin(x)",
    "cos(x)",
    "Σx",
    "Σx_k",
    "Σx/n",
@@ -2921,7 +2927,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
    "cross_entropy_loss_back(x,y)",
 };
-static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74");
+static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
 static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@@ -4882,6 +4888,72 @@ struct ggml_tensor * ggml_log_inplace(
    return ggml_log_impl(ctx, a, true);
 }
 // ggml_sin
 static struct ggml_tensor * ggml_sin_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        bool inplace) {
    bool is_node = false;
    if (!inplace && (a->grad)) {
        is_node = true;
    }
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
    result->op   = GGML_OP_SIN;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    return result;
 }
 struct ggml_tensor * ggml_sin(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    return ggml_sin_impl(ctx, a, false);
 }
 struct ggml_tensor * ggml_sin_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    return ggml_sin_impl(ctx, a, true);
 }
 // ggml_cos
 static struct ggml_tensor * ggml_cos_impl(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        bool inplace) {
    bool is_node = false;
    if (!inplace && (a->grad)) {
        is_node = true;
    }
    struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
    result->op   = GGML_OP_COS;
    result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
    result->src[0] = a;
    return result;
 }
 struct ggml_tensor * ggml_cos(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    return ggml_cos_impl(ctx, a, false);
 }
 struct ggml_tensor * ggml_cos_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor  * a) {
    return ggml_cos_impl(ctx, a, true);
 }
 // ggml_sum
 struct ggml_tensor * ggml_sum(
@@ -10512,6 +10584,96 @@ static void ggml_compute_forward_log(
    }
 }
 // ggml_compute_forward_sin
 static void ggml_compute_forward_sin_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    if (params->ith != 0) {
        return;
    }
    GGML_ASSERT(ggml_are_same_shape(src0, dst));
    const int n  = ggml_nrows(src0);
    const int nc = src0->ne[0];
    GGML_ASSERT( dst->nb[0] == sizeof(float));
    GGML_ASSERT(src0->nb[0] == sizeof(float));
    for (int i = 0; i < n; i++) {
        ggml_vec_sin_f32(nc,
                (float *) ((char *) dst->data  + i*( dst->nb[1])),
                (float *) ((char *) src0->data + i*(src0->nb[1])));
    }
 }
 static void ggml_compute_forward_sin(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_sin_f32(params, dst);
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
 }
 // ggml_compute_forward_cos
 static void ggml_compute_forward_cos_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    if (params->ith != 0) {
        return;
    }
    GGML_ASSERT(ggml_are_same_shape(src0, dst));
    const int n  = ggml_nrows(src0);
    const int nc = src0->ne[0];
    GGML_ASSERT( dst->nb[0] == sizeof(float));
    GGML_ASSERT(src0->nb[0] == sizeof(float));
    for (int i = 0; i < n; i++) {
        ggml_vec_cos_f32(nc,
                (float *) ((char *) dst->data  + i*( dst->nb[1])),
                (float *) ((char *) src0->data + i*(src0->nb[1])));
    }
 }
 static void ggml_compute_forward_cos(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {
    const struct ggml_tensor * src0 = dst->src[0];
    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_cos_f32(params, dst);
            } break;
        default:
            {
                GGML_ABORT("fatal error");
            }
    }
 }
 // ggml_compute_forward_sum
 static void ggml_compute_forward_sum_f32(
@@ -16787,6 +16949,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
            {
                ggml_compute_forward_log(params, tensor);
            } break;
        case GGML_OP_SIN:
            {
                ggml_compute_forward_sin(params, tensor);
            } break;
        case GGML_OP_COS:
            {
                ggml_compute_forward_cos(params, tensor);
            } break;
        case GGML_OP_SUM:
            {
                ggml_compute_forward_sum(params, tensor);
@@ -17433,6 +17603,30 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                                zero_table);
                }
            } break;
        case GGML_OP_SIN:
            {
                if (src0->grad) {
                    src0->grad =
                        ggml_add_or_set(ctx,
                                src0->grad,
                                ggml_mul(ctx,
                                    tensor->grad,
                                    ggml_cos(ctx, src0)),
                                zero_table);
                }
            } break;
        case GGML_OP_COS:
            {
                if (src0->grad) {
                    src0->grad =
                        ggml_sub_or_set(ctx,
                                src0->grad,
                                ggml_mul(ctx,
                                    tensor->grad,
                                    ggml_sin(ctx, src0)),
                                zero_table);
                }
            } break;
        case GGML_OP_SUM:
            {
                if (src0->grad) {
@@ -18520,6 +18714,8 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_LOG:
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_SUM:
        case GGML_OP_SUM_ROWS:
        case GGML_OP_MEAN:
--- a/ggml/src/vulkan-shaders/CMakeLists.txt
+++ b/ggml/src/vulkan-shaders/CMakeLists.txt
@@ -0,0 +1,7 @@
 find_package (Threads REQUIRED)
 set(TARGET vulkan-shaders-gen)
 add_executable(${TARGET} vulkan-shaders-gen.cpp)
 install(TARGETS ${TARGET} RUNTIME)
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
 target_link_libraries(vulkan-shaders-gen PUBLIC Threads::Threads)
--- a/ggml/src/vulkan-shaders/add.comp
+++ b/ggml/src/vulkan-shaders/add.comp
@@ -4,9 +4,11 @@
 #include "generic_binary_head.comp"
 void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) + FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) + FLOAT_TYPE(data_b[src1_idx(idx)]));
 }
--- a/ggml/src/vulkan-shaders/clamp.comp
+++ b/ggml/src/vulkan-shaders/clamp.comp
@@ -4,10 +4,12 @@
 #include "generic_unary_head.comp"
 void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val < p.param1 ? p.param1 : (val > p.param2 ? p.param2 : val));
 }
--- a/ggml/src/vulkan-shaders/concat.comp
+++ b/ggml/src/vulkan-shaders/concat.comp
@@ -0,0 +1,35 @@
 #version 450
 #include "types.comp"
 #include "generic_binary_head.comp"
 void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    const int dim = p.param3;
    if (idx >= p.ne) {
        return;
    }
    const uint i3 = idx / (p.ne22*p.ne21*p.ne20);
    const uint i3_offset = i3 * p.ne22*p.ne21*p.ne20;
    const uint i2 = (idx - i3_offset) / (p.ne21*p.ne20);
    const uint i2_offset = i2*p.ne21*p.ne20;
    const uint i1 = (idx - i3_offset - i2_offset) / p.ne20;
    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne20;
    uint o[4] = {0, 0, 0, 0};
    o[dim] = dim == 0 ? p.ne00 : (dim == 1 ? p.ne01 : (dim == 2 ? p.ne02 : p.ne03));
    const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
    const uint src1_idx = (i3 - o[3])*p.nb13 + (i2 - o[2])*p.nb12 + (i1 - o[1])*p.nb11 + (i0 - o[0])*p.nb10;
    const uint dst_idx = i3*p.nb23 + i2*p.nb22 + i1*p.nb21 + i0*p.nb20;
    const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
    data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : data_b[src1_idx]);
 #else
    data_d[p.d_offset + dst_idx] = is_src0 ? data_a[src0_idx] : data_b[src1_idx];
 #endif
 }
--- a/ggml/src/vulkan-shaders/copy.comp
+++ b/ggml/src/vulkan-shaders/copy.comp
@@ -4,13 +4,15 @@
 #include "generic_unary_head.comp"
 void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
 #ifndef OPTIMIZATION_ERROR_WORKAROUND
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(data_a[src0_idx(idx)]);
 #else
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = data_a[src0_idx(gl_GlobalInvocationID.x)];
+    data_d[p.d_offset + dst_idx(idx)] = data_a[src0_idx(idx)];
 #endif
 }
--- a/ggml/src/vulkan-shaders/cos.comp
+++ b/ggml/src/vulkan-shaders/cos.comp
@@ -0,0 +1,15 @@
 #version 450
 #include "types.comp"
 #include "generic_unary_head.comp"
 void main() {
    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(cos(val));
 }
--- a/ggml/src/vulkan-shaders/dequant_funcs.comp
+++ b/ggml/src/vulkan-shaders/dequant_funcs.comp
@@ -58,3 +58,11 @@ vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    return vec2(int(data_a[a_offset + ib].qs[iqs]), int(data_a[a_offset + ib].qs[iqs + 1])) * d;
 }
 #endif
 #if defined(DATA_A_IQ4_NL)
 vec2 dequantize(uint ib, uint iqs, uint a_offset) {
    const float d = float(data_a[a_offset + ib].d);
    const uint vui = uint(data_a[a_offset + ib].qs[iqs]);
    return vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
 }
 #endif
--- a/ggml/src/vulkan-shaders/dequant_iq4_nl.comp
+++ b/ggml/src/vulkan-shaders/dequant_iq4_nl.comp
@@ -0,0 +1,30 @@
 #version 450
 #include "dequant_head.comp"
 layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer A {block_iq4_nl data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 void main() {
    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
    const uint tid = gl_LocalInvocationID.x % 64;
    const uint il  = tid/32;
    const uint ir  = tid%32;
    const uint ib = 32*i + ir;
    if (ib >= p.nel / 32) {
        return;
    }
    const uint q_idx = 8*il;
    const uint b_idx = 1024*i + 32*ir + q_idx;
    const float d = float(data_a[ib].d);
    [[unroll]] for (uint l = 0; l < 8; ++l) {
        data_b[b_idx + l +  0] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] & 0xF]);
        data_b[b_idx + l + 16] = D_TYPE(d * kvalues_iq4nl[data_a[ib].qs[q_idx + l] >>  4]);
    }
 }
--- a/ggml/src/vulkan-shaders/dequant_q4_0.comp
+++ b/ggml/src/vulkan-shaders/dequant_q4_0.comp
@@ -18,15 +18,13 @@ void main() {
        return;
    }
-    const uint b_idx = 1024*i + 32*ir + 8*il;
+    const uint q_idx = 8*il;
    const uint b_idx = 1024*i + 32*ir + q_idx;
    const float d = float(data_a[ib].d);
    const float dm = -8.0f * d;
    const uint q_idx = 8*il;
    [[unroll]] for (uint l = 0; l < 8; ++l) {
-        data_b[b_idx + l +  0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm);
+        data_b[b_idx + l +  0] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] & 0xF) - 8.0f));
-        data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >>  4) + dm);
+        data_b[b_idx + l + 16] = D_TYPE(d * ((data_a[ib].qs[q_idx + l] >>  4) - 8.0f));
    }
 }
--- a/ggml/src/vulkan-shaders/div.comp
+++ b/ggml/src/vulkan-shaders/div.comp
@@ -4,9 +4,11 @@
 #include "generic_binary_head.comp"
 void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) / FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) / FLOAT_TYPE(data_b[src1_idx(idx)]));
 }
--- a/ggml/src/vulkan-shaders/gelu.comp
+++ b/ggml/src/vulkan-shaders/gelu.comp
@@ -13,7 +13,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
    const float GELU_COEF_A    = 0.044715f;
    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
-    const uint i = gl_GlobalInvocationID.x;
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (i >= p.KX) {
        return;
--- a/ggml/src/vulkan-shaders/gelu_quick.comp
+++ b/ggml/src/vulkan-shaders/gelu_quick.comp
@@ -0,0 +1,23 @@
 #version 450
 #include "generic_head.comp"
 #include "types.comp"
 #extension GL_EXT_control_flow_attributes : enable
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
    const float GELU_QUICK_COEF = -1.702f;
    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (i >= p.KX) {
        return;
    }
    const float x = float(data_a[i]);
    data_d[i] = D_TYPE(x * (1.0f / (1.0f + exp(GELU_QUICK_COEF * x))));
 }
--- a/ggml/src/vulkan-shaders/generic_binary_head.comp
+++ b/ggml/src/vulkan-shaders/generic_binary_head.comp
@@ -7,7 +7,7 @@ layout (push_constant) uniform parameter
    uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
    uint ne20; uint ne21; uint ne22; uint ne23; uint nb20; uint nb21; uint nb22; uint nb23;
    uint d_offset;
-    float param1; float param2;
+    float param1; float param2; int param3;
 } p;
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
@@ -16,6 +16,10 @@ layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
 layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
 uint get_idx() {
    return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 }
 uint src0_idx(uint idx) {
    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
--- a/ggml/src/vulkan-shaders/generic_unary_head.comp
+++ b/ggml/src/vulkan-shaders/generic_unary_head.comp
@@ -14,6 +14,10 @@ layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 uint get_idx() {
    return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
 }
 uint src0_idx(uint idx) {
    const uint i03 = idx / (p.ne02*p.ne01*p.ne00);
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;
--- a/ggml/src/vulkan-shaders/group_norm.comp
+++ b/ggml/src/vulkan-shaders/group_norm.comp
@@ -0,0 +1,66 @@
 #version 450
 #include "generic_head.comp"
 #include "types.comp"
 #extension GL_EXT_control_flow_attributes : enable
 #define BLOCK_SIZE 512
 layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 shared float tmp[BLOCK_SIZE];
 void main() {
    const uint group_size = p.KX;
    const float eps = p.param1;
    const uint tid = gl_LocalInvocationID.x;
    const uint start = gl_WorkGroupID.x * group_size + tid;
    const uint end = start + group_size;
    tmp[tid] = 0.0f;
    // Calculate mean
    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        tmp[tid] += float(data_a[col]);
    }
    // tmp up partial tmps and write back result
    barrier();
    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }
    const float mean = tmp[0] / group_size;
    barrier();
    tmp[tid] = 0.0f;
    // Calculate variance
    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        const float xi = float(data_a[col]) - mean;
        data_d[col] = D_TYPE(xi);
        tmp[tid] += xi * xi;
    }
    // sum up partial sums and write back result
    barrier();
    [[unroll]] for (int s = BLOCK_SIZE / 2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
        barrier();
    }
    const float variance = tmp[0] / group_size;
    const float scale = inversesqrt(variance + eps);
    [[unroll]] for (uint col = start; col < end; col += BLOCK_SIZE) {
        data_d[col] *= D_TYPE(scale);
    }
 }
--- a/ggml/src/vulkan-shaders/im2col.comp
+++ b/ggml/src/vulkan-shaders/im2col.comp
@@ -0,0 +1,57 @@
 #version 450
 #extension GL_EXT_shader_16bit_storage : require
 layout (push_constant) uniform parameter
 {
    uint batch_offset; uint offset_delta;
    uint IC;
    uint IW; uint IH;
    uint OW; uint OH;
    uint KW; uint KH;
    uint pelements;
    uint CHW;
    int s0; int s1;
    int p0; int p1;
    int d0; int d1;
 } p;
 #include "types.comp"
 #define BLOCK_SIZE 256
 layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
    const uint i = gl_GlobalInvocationID.x;
    if (i >= p.pelements) {
        return;
    }
    const uint ksize = p.OW * (p.KH > 1 ? p.KW : 1);
    const uint kx = i / ksize;
    const uint kd = kx * ksize;
    const uint ky = (i - kd) / p.OW;
    const uint ix = i % p.OW;
    const uint oh = gl_GlobalInvocationID.y;
    const uint batch = gl_GlobalInvocationID.z / p.IC;
    const uint ic = gl_GlobalInvocationID.z % p.IC;
    const uint iiw = ix * p.s0 + kx * p.d0 - p.p0;
    const uint iih = oh * p.s1 + ky * p.d1 - p.p1;
    const uint offset_dst =
        ((batch * p.OH + oh) * p.OW + ix) * p.CHW +
        (ic * (p.KW * p.KH) + ky * p.KW + kx);
    if (iih < 0 || iih >= p.IH || iiw < 0 || iiw >= p.IW) {
        data_d[offset_dst] = D_TYPE(0.0f);
    } else {
        const uint offset_src = ic * p.offset_delta + batch * p.batch_offset;
        data_d[offset_dst] = D_TYPE(data_a[offset_src + iih * p.IW + iiw]);
    }
 }
--- a/ggml/src/vulkan-shaders/leaky_relu.comp
+++ b/ggml/src/vulkan-shaders/leaky_relu.comp
@@ -0,0 +1,22 @@
 #version 450
 #include "generic_head.comp"
 #include "types.comp"
 #extension GL_EXT_control_flow_attributes : enable
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (i >= p.KX) {
        return;
    }
    const float val = float(data_a[i]);
    data_d[i] = D_TYPE(max(val, 0.0f) + min(val, 0.0f) * p.param1);
 }
--- a/ggml/src/vulkan-shaders/mul.comp
+++ b/ggml/src/vulkan-shaders/mul.comp
@@ -4,9 +4,11 @@
 #include "generic_binary_head.comp"
 void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(data_b[src1_idx(gl_GlobalInvocationID.x)]));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(data_b[src1_idx(idx)]));
 }
--- a/ggml/src/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/vulkan-shaders/mul_mat_vec.comp
@@ -16,6 +16,13 @@ void main() {
    const uint row = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
    const uint tid = gl_LocalInvocationID.x;
    // There are not enough cols to use all threads
    if (tid >= p.ncols) {
        return;
    }
    const uint block_size = min(p.ncols, BLOCK_SIZE);
    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);
@@ -23,8 +30,8 @@ void main() {
    tmp[tid] = FLOAT_TYPE(0.0f);
-    [[unroll]] for (uint i = 0; i < p.ncols/BLOCK_SIZE; i += 2) {
+    [[unroll]] for (uint i = 0; i < p.ncols/block_size; i += 2) {
-        const uint col = i*BLOCK_SIZE + 2*tid;
+        const uint col = i*block_size + 2*tid;
        const uint ib = (row*p.ncols + col)/QUANT_K; // block index
        const uint iqs = (col%QUANT_K)/QUANT_R; // quant index
        const uint iybs = col - col%QUANT_K; // y block start index
@@ -38,7 +45,7 @@ void main() {
    // sum up partial sums and write back result
    barrier();
-    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
+    [[unroll]] for (uint s = block_size/2; s > 0; s >>= 1) {
        if (tid < s) {
            tmp[tid] += tmp[tid + s];
        }
--- a/ggml/src/vulkan-shaders/mul_mm.comp
+++ b/ggml/src/vulkan-shaders/mul_mm.comp
@@ -71,7 +71,7 @@ shared FLOAT_TYPE buf_a[BM * (BK+1)];
 shared FLOAT_TYPE buf_b[BN * (BK+1)];
 #ifdef MUL_MAT_ID
-shared u16vec2 row_ids[2048];
+shared u16vec2 row_ids[3072];
 #endif
 void main() {
@@ -380,6 +380,19 @@ void main() {
            buf_a[buf_idx    ] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi    ] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi    ] >> qhshift) & 3) << 4)) - 32));
            buf_a[buf_idx + 1] = FLOAT_TYPE(dscale * float(int8_t(((data_a[ib].ql[qsi + 1] >> (b * 4)) & 0xF) | (((data_a[ib].qh[qhi + 1] >> qhshift) & 3) << 4)) - 32));
 #elif defined(DATA_A_IQ4_NL)
            const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
            const uint buf_idx = (loadc_a + l) * (BK+1) + loadr_a;
            const uint ib = idx / 16;
            const uint iqs = idx & 0xF;
            const float d = float(data_a[ib].d);
            const uint vui = uint(data_a[ib].qs[iqs]);
            const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
            buf_a[buf_idx     ] = FLOAT_TYPE(v.x);
            buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
 #endif
        }
        [[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
--- a/ggml/src/vulkan-shaders/norm.comp
+++ b/ggml/src/vulkan-shaders/norm.comp
@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 shared vec2 sum[BLOCK_SIZE];
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint tid = gl_LocalInvocationID.x;
    sum[tid] = vec2(0.0f, 0.0f);
--- a/ggml/src/vulkan-shaders/pad.comp
+++ b/ggml/src/vulkan-shaders/pad.comp
@@ -0,0 +1,26 @@
 #version 450
 #include "types.comp"
 #include "generic_unary_head.comp"
 void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.ne) {
        return;
    }
    const uint i3 = idx / (p.ne12*p.ne11*p.ne10);
    const uint i3_offset = i3 * p.ne12*p.ne11*p.ne10;
    const uint i2 = (idx - i3_offset) / (p.ne11*p.ne10);
    const uint i2_offset = i2*p.ne11*p.ne10;
    const uint i1 = (idx - i3_offset - i2_offset) / p.ne10;
    const uint i0 = idx - i3_offset - i2_offset - i1*p.ne10;
    const uint src0_idx = i3*p.nb03 + i2*p.nb02 + i1*p.nb01 + i0*p.nb00;
    const uint dst_idx = i3*p.nb13 + i2*p.nb12 + i1*p.nb11 + i0*p.nb10;
    const bool is_src0 = i0 < p.ne00 && i1 < p.ne01 && i2 < p.ne02 && i3 < p.ne03;
    data_d[p.d_offset + dst_idx] = D_TYPE(is_src0 ? data_a[src0_idx] : 0.0f);
 }
--- a/ggml/src/vulkan-shaders/relu.comp
+++ b/ggml/src/vulkan-shaders/relu.comp
@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
-    const uint i = gl_GlobalInvocationID.x;
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (i >= p.KX) {
        return;
--- a/ggml/src/vulkan-shaders/rms_norm.comp
+++ b/ggml/src/vulkan-shaders/rms_norm.comp
@@ -14,7 +14,7 @@ layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 shared FLOAT_TYPE sum[BLOCK_SIZE];
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint tid = gl_LocalInvocationID.x;
    sum[tid] = FLOAT_TYPE(0.0f); // partial sum for thread in warp
--- a/ggml/src/vulkan-shaders/scale.comp
+++ b/ggml/src/vulkan-shaders/scale.comp
@@ -4,9 +4,11 @@
 #include "generic_unary_head.comp"
 void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]) * FLOAT_TYPE(p.param1));
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1));
 }
--- a/ggml/src/vulkan-shaders/silu.comp
+++ b/ggml/src/vulkan-shaders/silu.comp
@@ -11,7 +11,7 @@ layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
-    const uint i = gl_GlobalInvocationID.x;
+    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (i >= p.KX) {
        return;
--- a/ggml/src/vulkan-shaders/sin.comp
+++ b/ggml/src/vulkan-shaders/sin.comp
@@ -0,0 +1,15 @@
 #version 450
 #include "types.comp"
 #include "generic_unary_head.comp"
 void main() {
    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(sin(val));
 }
--- a/ggml/src/vulkan-shaders/soft_max.comp
+++ b/ggml/src/vulkan-shaders/soft_max.comp
@@ -28,7 +28,7 @@ shared FLOAT_TYPE vals[BLOCK_SIZE];
 void main() {
    const uint tid = gl_LocalInvocationID.x;
-    const uint rowx = gl_WorkGroupID.x;
+    const uint rowx = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint rowy = rowx % p.KY;
    float slope = 1.0f;
--- a/ggml/src/vulkan-shaders/square.comp
+++ b/ggml/src/vulkan-shaders/square.comp
@@ -4,10 +4,12 @@
 #include "generic_unary_head.comp"
 void main() {
-    if (gl_GlobalInvocationID.x >= p.ne) {
+    const uint idx = get_idx();
    if (idx >= p.ne) {
        return;
    }
-    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(gl_GlobalInvocationID.x)]);
+    const FLOAT_TYPE val = FLOAT_TYPE(data_a[src0_idx(idx)]);
-    data_d[p.d_offset + dst_idx(gl_GlobalInvocationID.x)] = D_TYPE(val * val);
+    data_d[p.d_offset + dst_idx(idx)] = D_TYPE(val * val);
 }
--- a/ggml/src/vulkan-shaders/sum_rows.comp
+++ b/ggml/src/vulkan-shaders/sum_rows.comp
@@ -14,7 +14,7 @@ layout (constant_id = 0) const uint BLOCK_SIZE = 32;
 shared FLOAT_TYPE tmp[BLOCK_SIZE];
 void main() {
-    const uint row = gl_WorkGroupID.x;
+    const uint row = gl_WorkGroupID.z * 262144 + gl_WorkGroupID.y * 512 + gl_WorkGroupID.x;
    const uint col = gl_LocalInvocationID.x;
    tmp[col] = FLOAT_TYPE(0.0f);
--- a/ggml/src/vulkan-shaders/tanh.comp
+++ b/ggml/src/vulkan-shaders/tanh.comp
@@ -0,0 +1,21 @@
 #version 450
 #include "generic_head.comp"
 #include "types.comp"
 #extension GL_EXT_control_flow_attributes : enable
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
    const uint i = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (i >= p.KX) {
        return;
    }
    data_d[i] = D_TYPE(tanh(data_a[i]));
 }
--- a/ggml/src/vulkan-shaders/timestep_embedding.comp
+++ b/ggml/src/vulkan-shaders/timestep_embedding.comp
@@ -0,0 +1,41 @@
 #version 450
 #extension GL_EXT_shader_16bit_storage : require
 layout (push_constant) uniform parameter
 {
    uint nb1;
    uint dim;
    uint max_period;
 } p;
 #include "types.comp"
 #extension GL_EXT_control_flow_attributes : enable
 #define BLOCK_SIZE 256
 layout(local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer X {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
    const uint i = gl_WorkGroupID.y;
    const uint j = gl_GlobalInvocationID.x;
    const uint d_offset = i * p.nb1;
    if (p.dim % 2 != 0 && j == ((p.dim + 1) / 2)) {
        data_d[d_offset + p.dim] = 0.f;
    }
    const uint half_dim = p.dim / 2;
    if (j >= half_dim) {
        return;
    }
    const float timestep = float(data_a[i]);
    const float freq = float(exp(-log(p.max_period) * j / half_dim));
    const float arg = timestep * freq;
    data_d[d_offset + j] = D_TYPE(cos(arg));
    data_d[d_offset + j + half_dim] = D_TYPE(sin(arg));
 }
--- a/ggml/src/vulkan-shaders/types.comp
+++ b/ggml/src/vulkan-shaders/types.comp
@@ -6,7 +6,7 @@
 #define QUANT_K 1
 #define QUANT_R 1
-#ifndef LOAD_VEC_A
+#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
 #define A_TYPE float
 #elif LOAD_VEC_A == 4
 #define A_TYPE vec4
@@ -19,7 +19,7 @@
 #define QUANT_K 1
 #define QUANT_R 1
-#ifndef LOAD_VEC_A
+#if !defined(LOAD_VEC_A) || LOAD_VEC_A == 1
 #define A_TYPE float16_t
 #elif LOAD_VEC_A == 4
 #define A_TYPE f16vec4
@@ -177,3 +177,24 @@ struct block_q6_K
 #define A_TYPE block_q6_K
 #endif
 // IQuants
 #if defined(DATA_A_IQ4_NL)
 #extension GL_EXT_shader_16bit_storage : require
 #define QUANT_K 32
 #define QUANT_R 2
 struct block_iq4_nl
 {
    float16_t d;
    uint8_t qs[QUANT_K/2];
 };
 #define A_TYPE block_iq4_nl
 const int8_t kvalues_iq4nl[16] = {
    int8_t(-127), int8_t(-104), int8_t(-83), int8_t(-65), int8_t(-49), int8_t(-35), int8_t(-22), int8_t(-10),
    int8_t(1), int8_t(13), int8_t(25), int8_t(38), int8_t(53), int8_t(69), int8_t(89), int8_t(113)
 };
 #endif
--- a/ggml/src/vulkan-shaders/upscale.comp
+++ b/ggml/src/vulkan-shaders/upscale.comp
@@ -0,0 +1,36 @@
 #version 450
 layout (push_constant) uniform parameter
 {
    uint ne; uint d_offset;
    uint nb00; uint nb01; uint nb02; uint nb03;
    uint ne10; uint ne11; uint ne12; uint ne13;
    float sf0; float sf1; float sf2; float sf3;
 } p;
 #include "types.comp"
 layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in;
 layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_d[];};
 void main() {
    const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
    if (idx >= p.ne) {
        return;
    }
    const uint i10 = idx % p.ne10;
    const uint i11 = (idx / p.ne10) % p.ne11;
    const uint i12 = (idx / (p.ne10 * p.ne11)) % p.ne12;
    const uint i13 = (idx / (p.ne10 * p.ne11 * p.ne12)) % p.ne13;
    const uint i00 = uint(i10 / p.sf0);
    const uint i01 = uint(i11 / p.sf1);
    const uint i02 = uint(i12 / p.sf2);
    const uint i03 = uint(i13 / p.sf3);
    data_d[p.d_offset + idx] = D_TYPE(data_a[i03 * p.nb03 + i02 * p.nb02 + i01 * p.nb01 + i00 * p.nb00]);
 }
--- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
+++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
@@ -0,0 +1,587 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
 #include <string>
 #include <stdexcept>
 #include <array>
 #include <vector>
 #include <map>
 #include <thread>
 #include <mutex>
 #include <future>
 #include <queue>
 #include <condition_variable>
 #include <cstdio>
 #include <cstring>
 #include <cstdlib>
 #include <sys/stat.h>
 #include <sys/types.h>
 #ifdef _WIN32
    #include <windows.h>
    #include <direct.h> // For _mkdir on Windows
    #include <algorithm> // For std::replace on w64devkit
 #else
    #include <unistd.h>
    #include <sys/wait.h>
    #include <fcntl.h>
 #endif
 #define ASYNCIO_CONCURRENCY 64
 std::mutex lock;
 std::vector<std::pair<std::string, std::string>> shader_fnames;
 std::string GLSLC = "glslc";
 std::string input_dir = "vulkan-shaders";
 std::string output_dir = "/tmp";
 std::string target_hpp = "ggml-vulkan-shaders.hpp";
 std::string target_cpp = "ggml-vulkan-shaders.cpp";
 bool no_clean = false;
 const std::vector<std::string> type_names = {
    "f32",
    "f16",
    "q4_0",
    "q4_1",
    "q5_0",
    "q5_1",
    "q8_0",
    "q2_k",
    "q3_k",
    "q4_k",
    "q5_k",
    "q6_k",
    "iq4_nl"
 };
 void execute_command(const std::string& command, std::string& stdout_str, std::string& stderr_str) {
 #ifdef _WIN32
    HANDLE stdout_read, stdout_write;
    HANDLE stderr_read, stderr_write;
    SECURITY_ATTRIBUTES sa = { sizeof(SECURITY_ATTRIBUTES), NULL, TRUE };
    if (!CreatePipe(&stdout_read, &stdout_write, &sa, 0) ||
        !SetHandleInformation(stdout_read, HANDLE_FLAG_INHERIT, 0)) {
        throw std::runtime_error("Failed to create stdout pipe");
    }
    if (!CreatePipe(&stderr_read, &stderr_write, &sa, 0) ||
        !SetHandleInformation(stderr_read, HANDLE_FLAG_INHERIT, 0)) {
        throw std::runtime_error("Failed to create stderr pipe");
    }
    PROCESS_INFORMATION pi;
    STARTUPINFOA si = { sizeof(STARTUPINFOA) };
    si.dwFlags = STARTF_USESTDHANDLES;
    si.hStdOutput = stdout_write;
    si.hStdError = stderr_write;
    std::vector<char> cmd(command.begin(), command.end());
    cmd.push_back('\0');
    if (!CreateProcessA(NULL, cmd.data(), NULL, NULL, TRUE, 0, NULL, NULL, &si, &pi)) {
        throw std::runtime_error("Failed to create process");
    }
    CloseHandle(stdout_write);
    CloseHandle(stderr_write);
    std::array<char, 128> buffer;
    DWORD bytes_read;
    while (ReadFile(stdout_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
        stdout_str.append(buffer.data(), bytes_read);
    }
    while (ReadFile(stderr_read, buffer.data(), buffer.size(), &bytes_read, NULL) && bytes_read > 0) {
        stderr_str.append(buffer.data(), bytes_read);
    }
    CloseHandle(stdout_read);
    CloseHandle(stderr_read);
    WaitForSingleObject(pi.hProcess, INFINITE);
    CloseHandle(pi.hProcess);
    CloseHandle(pi.hThread);
 #else
 int stdout_pipe[2];
    int stderr_pipe[2];
    if (pipe(stdout_pipe) != 0 || pipe(stderr_pipe) != 0) {
        throw std::runtime_error("Failed to create pipes");
    }
    pid_t pid = fork();
    if (pid < 0) {
        throw std::runtime_error("Failed to fork process");
    }
    if (pid == 0) {
        close(stdout_pipe[0]);
        close(stderr_pipe[0]);
        dup2(stdout_pipe[1], STDOUT_FILENO);
        dup2(stderr_pipe[1], STDERR_FILENO);
        close(stdout_pipe[1]);
        close(stderr_pipe[1]);
        execl("/bin/sh", "sh", "-c", command.c_str(), (char*) nullptr);
        _exit(EXIT_FAILURE);
    } else {
        close(stdout_pipe[1]);
        close(stderr_pipe[1]);
        std::array<char, 128> buffer;
        ssize_t bytes_read;
        while ((bytes_read = read(stdout_pipe[0], buffer.data(), buffer.size())) > 0) {
            stdout_str.append(buffer.data(), bytes_read);
        }
        while ((bytes_read = read(stderr_pipe[0], buffer.data(), buffer.size())) > 0) {
            stderr_str.append(buffer.data(), bytes_read);
        }
        close(stdout_pipe[0]);
        close(stderr_pipe[0]);
        waitpid(pid, nullptr, 0);
    }
 #endif
 }
 bool directory_exists(const std::string& path) {
    struct stat info;
    if (stat(path.c_str(), &info) != 0) {
        return false; // Path doesn't exist or can't be accessed
    }
    return (info.st_mode & S_IFDIR) != 0; // Check if it is a directory
 }
 bool create_directory(const std::string& path) {
 #ifdef _WIN32
    return _mkdir(path.c_str()) == 0 || errno == EEXIST; // EEXIST means the directory already exists
 #else
    return mkdir(path.c_str(), 0755) == 0 || errno == EEXIST; // 0755 is the directory permissions
 #endif
 }
 std::string to_uppercase(const std::string& input) {
    std::string result = input;
    for (char& c : result) {
        c = std::toupper(c);
    }
    return result;
 }
 bool string_ends_with(const std::string& str, const std::string& suffix) {
    if (suffix.size() > str.size()) {
        return false;
    }
    return std::equal(suffix.rbegin(), suffix.rend(), str.rbegin());
 }
 static const char path_separator = '/';
 std::string join_paths(const std::string& path1, const std::string& path2) {
    return path1 + path_separator + path2;
 }
 std::string basename(const std::string &path) {
    return path.substr(path.find_last_of("/\\") + 1);
 }
 void string_to_spv(const std::string& _name, const std::string& in_fname, const std::map<std::string, std::string>& defines, bool fp16 = true) {
    std::string name = _name + (fp16 ? "" : "_fp32");
    std::string out_fname = join_paths(output_dir, name + ".spv");
    std::string in_path = join_paths(input_dir, in_fname);
    #ifdef _WIN32
        std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", "\"" + in_path + "\"", "-o", "\"" + out_fname + "\""};
    #else
        std::vector<std::string> cmd = {GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o",  out_fname};
    #endif
    for (const auto& define : defines) {
        cmd.push_back("-D" + define.first + "=" + define.second);
    }
    std::string command;
    for (const auto& part : cmd) {
        command += part + " ";
    }
    std::string stdout_str, stderr_str;
    try {
        // std::cout << "Executing command: ";
        // for (const auto& part : cmd) {
        //     std::cout << part << " ";
        // }
        // std::cout << std::endl;
        execute_command(command, stdout_str, stderr_str);
        if (!stderr_str.empty()) {
            std::cerr << "cannot compile " << name << "\n\n" << command << "\n\n" << stderr_str << std::endl;
            return;
        }
        std::lock_guard<std::mutex> guard(lock);
        shader_fnames.push_back(std::make_pair(name, out_fname));
    } catch (const std::exception& e) {
        std::cerr << "Error executing command for " << name << ": " << e.what() << std::endl;
    }
 }
 std::map<std::string, std::string> merge_maps(const std::map<std::string, std::string>& a, const std::map<std::string, std::string>& b) {
    std::map<std::string, std::string> result = a;
    result.insert(b.begin(), b.end());
    return result;
 }
 void matmul_shaders(std::vector<std::future<void>>& tasks, bool fp16, bool matmul_id) {
    std::string load_vec = fp16 ? "8" : "4";
    std::string aligned_b_type_f32 = fp16 ? "mat2x4" : "vec4";
    std::string aligned_b_type_f16 = fp16 ? "f16mat2x4" : "f16vec4";
    std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", fp16 ? "float16_t" : "float"}};
    std::string shader_name = "matmul";
    if (matmul_id) {
        base_dict["MUL_MAT_ID"] = "1";
        shader_name = "matmul_id";
    }
    if (fp16) {
        base_dict["FLOAT16"] = "1";
    }
    // Shaders with f16 B_TYPE
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv(shader_name + "_f32_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv(shader_name + "_f32_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F32", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv(shader_name + "_f16", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}), fp16);
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv(shader_name + "_f16_aligned", "mul_mm.comp", merge_maps(base_dict, {{"DATA_A_F16", "1"}, {"LOAD_VEC_A", load_vec}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f16}, {"D_TYPE", "float"}}), fp16);
    }));
    for (const auto& tname : type_names) {
        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
        // For unaligned, load one at a time for f32/f16, or two at a time for quants
        std::string load_vec_a_unaligned = (tname == "f32" || tname == "f16") ? "1" : "2";
        // For aligned matmul loads
        std::string load_vec_a = (tname == "f32" || tname == "f16") ? load_vec : "2";
        tasks.push_back(std::async(std::launch::async, [=] {
            string_to_spv(shader_name + "_" + tname + "_f32", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a_unaligned}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}), fp16);
        }));
        tasks.push_back(std::async(std::launch::async, [=] {
            string_to_spv(shader_name + "_" + tname + "_f32_aligned", "mul_mm.comp", merge_maps(base_dict, {{data_a_key, "1"}, {"LOAD_VEC_A", load_vec_a}, {"LOAD_VEC_B", load_vec}, {"B_TYPE", aligned_b_type_f32}, {"D_TYPE", "float"}}), fp16);
        }));
    }
 }
 void process_shaders(std::vector<std::future<void>>& tasks) {
    std::cout << "ggml_vulkan: Generating and compiling shaders to SPIR-V" << std::endl;
    std::map<std::string, std::string> base_dict = {{"FLOAT_TYPE", "float"}};
    for (const auto& fp16 : {false, true}) {
        matmul_shaders(tasks, fp16, false);
        matmul_shaders(tasks, fp16, true);
    }
    for (const auto& tname : type_names) {
        // mul mat vec
        std::string data_a_key = "DATA_A_" + to_uppercase(tname);
        std::string shader = (string_ends_with(tname, "_k")) ? "mul_mat_vec_" + tname + ".comp" : "mul_mat_vec.comp";
        tasks.push_back(std::async(std::launch::async, [=] {
            string_to_spv("mul_mat_vec_" + tname + "_f32_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
        }));
        tasks.push_back(std::async(std::launch::async, [=] {
            string_to_spv("mul_mat_vec_" + tname + "_f16_f32", shader, merge_maps(base_dict, {{data_a_key, "1"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
        }));
        tasks.push_back(std::async(std::launch::async, [=] {
            string_to_spv("mul_mat_vec_id_" + tname + "_f32", shader, merge_maps(base_dict, {{"MUL_MAT_ID", "1"}, {data_a_key, "1"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
        }));
        // Dequant shaders
        if (tname != "f16") {
            tasks.push_back(std::async(std::launch::async, [=] {
                string_to_spv("dequant_" + tname, "dequant_" + tname + ".comp", merge_maps(base_dict, {{data_a_key, "1"}, {"D_TYPE", "float16_t"}}));
            }));
        }
        if (!string_ends_with(tname, "_k")) {
            shader = (tname == "f32" || tname == "f16") ? "get_rows.comp" : "get_rows_quant.comp";
            if (tname == "f16") {
                tasks.push_back(std::async(std::launch::async, [=] {
                    string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
                }));
            } else {
                tasks.push_back(std::async(std::launch::async, [=] {
                    string_to_spv("get_rows_" + tname, shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float16_t"}});
                }));
            }
            tasks.push_back(std::async(std::launch::async, [=] {
                string_to_spv("get_rows_" + tname + "_f32", shader, {{data_a_key, "1"}, {"B_TYPE", "int"}, {"D_TYPE", "float"}});
            }));
        }
    }
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    // Norms
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("norm_f32", "norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("group_norm_f32", "group_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("rms_norm_f32", "rms_norm.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("mul_f32", "mul.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("div_f32", "div.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("scale_f32", "scale.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("sqr_f32", "square.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("sin_f32", "sin.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("cos_f32", "cos.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("clamp_f32", "clamp.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("pad_f32", "pad.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("concat_f32", "concat.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("concat_f16", "concat.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("concat_i32", "concat.comp", {{"A_TYPE", "int"}, {"B_TYPE", "int"}, {"D_TYPE", "int"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("upscale_f32", "upscale.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("gelu_f32", "gelu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("gelu_quick_f32", "gelu_quick.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("silu_f32", "silu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("relu_f32", "relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("leaky_relu_f32", "leaky_relu.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("tanh_f32", "tanh.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("soft_max_f32", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}}));
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("soft_max_f32_f16", "soft_max.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"B_TYPE", "float16_t"}, {"D_TYPE", "float"}}));
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("rope_norm_f32", "rope_norm.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("rope_norm_f16", "rope_norm.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("rope_neox_f32", "rope_neox.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("rope_neox_f16", "rope_neox.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}});
    }));
    tasks.push_back(std::async(std::launch::async, [] {
        string_to_spv("argsort_f32", "argsort.comp", {{"A_TYPE", "float"}});
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("sum_rows_f32", "sum_rows.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("im2col_f32", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("im2col_f32_f16", "im2col.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}));
    }));
    tasks.push_back(std::async(std::launch::async, [=] {
        string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
    }));
 }
 void write_output_files() {
    FILE* hdr = fopen(target_hpp.c_str(), "w");
    FILE* src = fopen(target_cpp.c_str(), "w");
    fprintf(hdr, "#include <cstdint>\n\n");
    fprintf(src, "#include \"%s\"\n\n", basename(target_hpp).c_str());
    for (const auto& pair : shader_fnames) {
        const std::string& name = pair.first;
        #ifdef _WIN32
            std::string path = pair.second;
            std::replace(path.begin(), path.end(), '/', '\\' );
        #else
            const std::string& path = pair.second;
        #endif
        FILE* spv = fopen(path.c_str(), "rb");
        if (!spv) {
            std::cerr << "Error opening SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
            continue;
        }
        fseek(spv, 0, SEEK_END);
        size_t size = ftell(spv);
        fseek(spv, 0, SEEK_SET);
        std::vector<unsigned char> data(size);
        size_t read_size = fread(data.data(), 1, size, spv);
        fclose(spv);
        if (read_size != size) {
            std::cerr << "Error reading SPIR-V file: " << path << " (" << strerror(errno) << ")\n";
            continue;
        }
        fprintf(hdr, "extern unsigned char %s_data[%zu];\n", name.c_str(), size);
        fprintf(hdr, "const uint64_t %s_len = %zu;\n\n", name.c_str(), size);
        fprintf(src, "unsigned char %s_data[%zu] = {\n", name.c_str(), size);
        for (size_t i = 0; i < size; ++i) {
            fprintf(src, "0x%02x,", data[i]);
            if ((i + 1) % 12 == 0) fprintf(src, "\n");
        }
        fprintf(src, "\n};\n\n");
        if (!no_clean) {
            std::remove(path.c_str());
        }
    }
    fclose(hdr);
    fclose(src);
 }
 int main(int argc, char** argv) {
    std::map<std::string, std::string> args;
    for (int i = 1; i < argc; i += 2) {
        if (i + 1 < argc) {
            args[argv[i]] = argv[i + 1];
        }
    }
    if (args.find("--glslc") != args.end()) {
        GLSLC = args["--glslc"]; // Path to glslc
    }
    if (args.find("--input-dir") != args.end()) {
        input_dir = args["--input-dir"]; // Directory containing shader sources
    }
    if (args.find("--output-dir") != args.end()) {
        output_dir = args["--output-dir"]; // Directory for containing SPIR-V output
    }
    if (args.find("--target-hpp") != args.end()) {
        target_hpp = args["--target-hpp"]; // Path to generated header file
    }
    if (args.find("--target-cpp") != args.end()) {
        target_cpp = args["--target-cpp"]; // Path to generated cpp file
    }
    if (args.find("--no-clean") != args.end()) {
        no_clean = true; // Keep temporary SPIR-V files in output-dir after build
    }
    if (!directory_exists(input_dir)) {
        std::cerr << "\"" << input_dir << "\" must be a valid directory containing shader sources" << std::endl;
        return EXIT_FAILURE;
    }
    if (!directory_exists(output_dir)) {
        if (!create_directory(output_dir)) {
            std::cerr << "Error creating output directory: " << output_dir << "\n";
            return EXIT_FAILURE;
        }
    }
    std::vector<std::future<void>> tasks;
    process_shaders(tasks);
    for (auto& task : tasks) {
        task.get();
    }
    write_output_files();
    return EXIT_SUCCESS;
 }
--- a/scripts/sync-ggml-am.sh
+++ b/scripts/sync-ggml-am.sh
@@ -65,6 +65,7 @@ while read c; do
        src/ggml-cann/* \
        src/ggml-cuda/* \
        src/ggml-sycl/* \
        src/vulkan-shaders/* \
        include/ggml*.h \
        examples/common.h \
        examples/common.cpp \
--- a/scripts/sync-ggml.last
+++ b/scripts/sync-ggml.last
@@ -1 +1 @@
-a735a7b5fce27d23c2a6b0b3ccbb47b2c51e83e7
+46e22f59eaf0aaa38a8e525fd89ba95e39ba7435
`@@ -1 +1 @@`
	`a735a7b5fce27d23c2a6b0b3ccbb47b2c51e83e7`	`46e22f59eaf0aaa38a8e525fd89ba95e39ba7435`