From 21b01a21b6d5b6d3527c8901c8ad471821b30b97 Mon Sep 17 00:00:00 2001 From: Jeff Bolz Date: Wed, 13 Nov 2024 00:58:57 -0600 Subject: [PATCH] vulkan: Optimize contiguous copies (llama/10254) * tests: Fix memory bandwidth calculation for perf tests Add a flops calculation for flash attention. Add one GGML_OP_CPY perf test. * vulkan: Optimize contiguous copies Add a variant of the copy shader for when the tensors are contiguous. Avoid the complex addressing calculations, and do four elements per invocation to hide some other overhead. Apply similar changes to the scale shader, since scale is always contiguous. Add a "progress bar" for shader compiles. --- ggml/src/ggml-vulkan.cpp | 76 ++++++++++++++----- ggml/src/vulkan-shaders/clamp.comp | 2 + ggml/src/vulkan-shaders/contig_copy.comp | 42 ++++++++++ ggml/src/vulkan-shaders/copy.comp | 2 + ggml/src/vulkan-shaders/cos.comp | 2 + .../vulkan-shaders/generic_unary_head.comp | 3 +- ggml/src/vulkan-shaders/pad.comp | 2 + ggml/src/vulkan-shaders/repeat.comp | 2 + ggml/src/vulkan-shaders/scale.comp | 20 +++-- ggml/src/vulkan-shaders/sin.comp | 2 + ggml/src/vulkan-shaders/square.comp | 2 + .../src/vulkan-shaders/vulkan-shaders-gen.cpp | 3 + 12 files changed, 132 insertions(+), 26 deletions(-) create mode 100644 ggml/src/vulkan-shaders/contig_copy.comp diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp index 308cf47c..169b5a3b 100644 --- a/ggml/src/ggml-vulkan.cpp +++ b/ggml/src/ggml-vulkan.cpp @@ -196,6 +196,7 @@ struct vk_device_struct { vk_pipeline pipeline_pad_f32; vk_pipeline pipeline_repeat_f32; vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; + vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16; vk_pipeline pipeline_norm_f32; vk_pipeline pipeline_group_norm_f32; vk_pipeline pipeline_rms_norm_f32; @@ -722,6 +723,12 @@ static void ggml_vk_create_pipeline_func(vk_device& device, vk_pipeline& pipelin std::lock_guard guard(compile_count_mutex); assert(compile_count > 0); compile_count--; + + // "Progress bar" for shader compiles + static uint32_t total_compile_count = 0; + if ((total_compile_count++ % 10) == 0) { + std::cerr << "."; + } } compile_count_cond.notify_all(); } @@ -1200,6 +1207,8 @@ static void ggml_vk_wait_events(vk_context& ctx, std::vector&& events static void ggml_vk_load_shaders(vk_device& device) { VK_LOG_DEBUG("ggml_vk_load_shaders(" << device->name << ")"); + std::cerr << "ggml_vulkan: Compiling shaders"; + // mulmat std::initializer_list warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; std::initializer_list warptile_m = { 128, 64, 64, 16, device->subgroup_size, 32, 2, 4, 2, device->subgroup_size }; @@ -1759,6 +1768,10 @@ static void ggml_vk_load_shaders(vk_device& device) { ggml_vk_create_pipeline(device, device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f32, "contig_cpy_f32_f32", contig_cpy_f32_f32_len, contig_cpy_f32_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f32_f16, "contig_cpy_f32_f16", contig_cpy_f32_f16_len, contig_cpy_f32_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_contig_cpy_f16_f16, "contig_cpy_f16_f16", contig_cpy_f16_f16_len, contig_cpy_f16_f16_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(device, device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); ggml_vk_create_pipeline(device, device->pipeline_add_f16_f32_f16, "add_f16_f32_f16", add_f16_f32_f16_len, add_f16_f32_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1); @@ -1817,6 +1830,7 @@ static void ggml_vk_load_shaders(vk_device& device) { for (auto &c : compiles) { c.wait(); } + std::cerr << "Done!" << std::endl; } static vk_device ggml_vk_get_device(size_t idx) { @@ -3061,18 +3075,34 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) { - if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) { - return ctx->device->pipeline_cpy_f32_f32; +static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src, const ggml_tensor * dst, ggml_type to) { + + // Choose "contiguous copy" shader if src/dst are contiguous + bool contig = ggml_is_contiguous(src) && (!dst || ggml_is_contiguous(dst)); + + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F32) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f32; + } else { + return ctx->device->pipeline_cpy_f32_f32; + } } - if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) { - return ctx->device->pipeline_cpy_f32_f16; + if (src->type == GGML_TYPE_F32 && to == GGML_TYPE_F16) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f32_f16; + } else { + return ctx->device->pipeline_cpy_f32_f16; + } } - if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) { - return ctx->device->pipeline_cpy_f16_f16; + if (src->type == GGML_TYPE_F16 && to == GGML_TYPE_F16) { + if (contig) { + return ctx->device->pipeline_contig_cpy_f16_f16; + } else { + return ctx->device->pipeline_cpy_f16_f16; + } } - std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; + std::cerr << "Missing CPY op for types: " << ggml_type_name(src->type) << " " << ggml_type_name(to) << std::endl; GGML_ABORT("fatal error"); } @@ -3082,6 +3112,15 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& const int tensor_type_size = ggml_type_size(tensor->type); const uint32_t ne = ggml_nelements(tensor); + std::array elements; + + if (ne > 262144) { + elements = { 512, 512, CEIL_DIV(ne, 262144) }; + } else if (ne > 512) { + elements = { 512, CEIL_DIV(ne, 512), 1 }; + } else { + elements = { ne, 1, 1 }; + } const vk_op_unary_push_constants pc = { (uint32_t)ne, @@ -3091,7 +3130,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context& 0.0f, 0.0f, }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, { ne, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_unary_push_constants), &pc, elements); } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) { @@ -3176,12 +3215,12 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context& sub vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16); } else { to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -3361,10 +3400,10 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context& vk_pipeline to_fp16_vk_0 = nullptr; vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -3745,12 +3784,12 @@ static void ggml_vk_mul_mat_id_q_f16(ggml_backend_vk_context * ctx, vk_context& vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, GGML_TYPE_F16); } else { to_fp16_vk_0 = ggml_vk_get_to_fp16(ctx, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, GGML_TYPE_F16); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, GGML_TYPE_F16); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -3938,10 +3977,10 @@ static void ggml_vk_mul_mat_vec_id_q_f16(ggml_backend_vk_context * ctx, vk_conte vk_pipeline to_fp16_vk_0 = nullptr; vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { - to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); + to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0, nullptr, src0->type); } if (y_non_contig) { - to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1->type, src1->type); + to_fp16_vk_1 = ggml_vk_get_cpy_pipeline(ctx, src1, nullptr, src1->type); } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } @@ -4148,7 +4187,7 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const case GGML_OP_CPY: case GGML_OP_CONT: case GGML_OP_DUP: - return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type); + return ggml_vk_get_cpy_pipeline(ctx, src0, dst, dst->type); case GGML_OP_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { return ctx->device->pipeline_norm_f32; @@ -4281,7 +4320,6 @@ static bool ggml_vk_op_supports_incontiguous(ggml_op op) { case GGML_OP_DIV: case GGML_OP_CONCAT: case GGML_OP_UPSCALE: - case GGML_OP_SCALE: case GGML_OP_SQR: case GGML_OP_SIN: case GGML_OP_COS: diff --git a/ggml/src/vulkan-shaders/clamp.comp b/ggml/src/vulkan-shaders/clamp.comp index 7071302a..ae8fa875 100644 --- a/ggml/src/vulkan-shaders/clamp.comp +++ b/ggml/src/vulkan-shaders/clamp.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/vulkan-shaders/contig_copy.comp b/ggml/src/vulkan-shaders/contig_copy.comp new file mode 100644 index 00000000..9acbdd3d --- /dev/null +++ b/ggml/src/vulkan-shaders/contig_copy.comp @@ -0,0 +1,42 @@ +#version 450 + +#include "types.comp" +#include "generic_unary_head.comp" + +#extension GL_EXT_control_flow_attributes : require + +const uint num_threads = 128; + +layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in; + +void main() { + uint idx = get_idx(); + + // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation + const uint num_iter = 4; + + // fast path for when all four iterations are in-bounds + if (idx + (num_iter-1)*num_threads < p.ne) { + [[unroll]] for (uint i = 0; i < num_iter; ++i) { +#ifndef OPTIMIZATION_ERROR_WORKAROUND + data_d[p.d_offset + idx] = D_TYPE(data_a[idx]); +#else + data_d[p.d_offset + idx] = data_a[idx]; +#endif + idx += num_threads; + } + } else { + [[unroll]] for (uint i = 0; i < num_iter; ++i) { + if (idx >= p.ne) { + continue; + } + +#ifndef OPTIMIZATION_ERROR_WORKAROUND + data_d[p.d_offset + idx] = D_TYPE(data_a[idx]); +#else + data_d[p.d_offset + idx] = data_a[idx]; +#endif + idx += num_threads; + } + } +} diff --git a/ggml/src/vulkan-shaders/copy.comp b/ggml/src/vulkan-shaders/copy.comp index c26917c0..2775068f 100644 --- a/ggml/src/vulkan-shaders/copy.comp +++ b/ggml/src/vulkan-shaders/copy.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/vulkan-shaders/cos.comp b/ggml/src/vulkan-shaders/cos.comp index f9a858cb..fbd9d272 100644 --- a/ggml/src/vulkan-shaders/cos.comp +++ b/ggml/src/vulkan-shaders/cos.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/vulkan-shaders/generic_unary_head.comp b/ggml/src/vulkan-shaders/generic_unary_head.comp index eacdefc7..4e1fa3af 100644 --- a/ggml/src/vulkan-shaders/generic_unary_head.comp +++ b/ggml/src/vulkan-shaders/generic_unary_head.comp @@ -1,4 +1,5 @@ #extension GL_EXT_shader_16bit_storage : require +#extension GL_EXT_control_flow_attributes : require layout (push_constant) uniform parameter { @@ -9,8 +10,6 @@ layout (push_constant) uniform parameter float param1; float param2; } p; -layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; - layout (binding = 0) readonly buffer A {A_TYPE data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_d[];}; diff --git a/ggml/src/vulkan-shaders/pad.comp b/ggml/src/vulkan-shaders/pad.comp index a465cd52..e87d8b18 100644 --- a/ggml/src/vulkan-shaders/pad.comp +++ b/ggml/src/vulkan-shaders/pad.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x; diff --git a/ggml/src/vulkan-shaders/repeat.comp b/ggml/src/vulkan-shaders/repeat.comp index a86af87e..c03f737c 100644 --- a/ggml/src/vulkan-shaders/repeat.comp +++ b/ggml/src/vulkan-shaders/repeat.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + uint src0_idx_mod(uint idx) { const uint i13 = idx / (p.ne12*p.ne11*p.ne10); const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10; diff --git a/ggml/src/vulkan-shaders/scale.comp b/ggml/src/vulkan-shaders/scale.comp index 5cd2f668..5cfee8c3 100644 --- a/ggml/src/vulkan-shaders/scale.comp +++ b/ggml/src/vulkan-shaders/scale.comp @@ -3,12 +3,22 @@ #include "types.comp" #include "generic_unary_head.comp" +const uint num_threads = 128; + +layout(local_size_x = num_threads, local_size_y = 1, local_size_z = 1) in; + void main() { - const uint idx = get_idx(); + uint idx = get_idx(); - if (idx >= p.ne) { - return; + // num_threads * num_iter must equal 512, to match the wg_denoms and get_idx calculation + const uint num_iter = 4; + + [[unroll]] for (uint i = 0; i < num_iter; ++i) { + if (idx >= p.ne) { + continue; + } + + data_d[p.d_offset + idx] = D_TYPE(FLOAT_TYPE(data_a[idx]) * FLOAT_TYPE(p.param1)); + idx += num_threads; } - - data_d[p.d_offset + dst_idx(idx)] = D_TYPE(FLOAT_TYPE(data_a[src0_idx(idx)]) * FLOAT_TYPE(p.param1)); } diff --git a/ggml/src/vulkan-shaders/sin.comp b/ggml/src/vulkan-shaders/sin.comp index 7faf9be9..67c48fb9 100644 --- a/ggml/src/vulkan-shaders/sin.comp +++ b/ggml/src/vulkan-shaders/sin.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/vulkan-shaders/square.comp b/ggml/src/vulkan-shaders/square.comp index 1fa118c9..2ff48ddc 100644 --- a/ggml/src/vulkan-shaders/square.comp +++ b/ggml/src/vulkan-shaders/square.comp @@ -3,6 +3,8 @@ #include "types.comp" #include "generic_unary_head.comp" +layout(local_size_x = 512, local_size_y = 1, local_size_z = 1) in; + void main() { const uint idx = get_idx(); diff --git a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp index 477355c2..5c84f473 100644 --- a/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp +++ b/ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp @@ -350,6 +350,9 @@ void process_shaders() { string_to_spv("cpy_f32_f32", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); string_to_spv("cpy_f32_f16", "copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); string_to_spv("cpy_f16_f16", "copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); + string_to_spv("contig_cpy_f32_f32", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float"}}); + string_to_spv("contig_cpy_f32_f16", "contig_copy.comp", {{"A_TYPE", "float"}, {"D_TYPE", "float16_t"}}); + string_to_spv("contig_cpy_f16_f16", "contig_copy.comp", {{"A_TYPE", "float16_t"}, {"D_TYPE", "float16_t"}, {"OPTIMIZATION_ERROR_WORKAROUND", "1"}}); string_to_spv("add_f32", "add.comp", {{"A_TYPE", "float"}, {"B_TYPE", "float"}, {"D_TYPE", "float"}, {"FLOAT_TYPE", "float"}}); string_to_spv("add_f16_f32_f16", "add.comp", {{"A_TYPE", "float16_t"}, {"B_TYPE", "float"}, {"D_TYPE", "float16_t"}, {"FLOAT_TYPE", "float"}});