mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-01-27 16:29:19 +01:00
ggml: Add POOL2D OP for GPU acceleration to the Vulkan backend in the MobileVLM model. (llama/9763)
* ggml: Add POOL2D OP for GPU ACC to the Vulkan.

  - The MobileVLM model now supports inference acceleration through GPU by utilizing the Vulkan backend.
  - A GGML_OP_POOL_2D shader has been added. (Pooling)
  - The encoding performance of the CLIP model improved from 2.8s on the CPU to 0.7s on the GPU.

  Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>

* [fix] Correct the incorrect order of the parameters.

  fix casting to int.

  Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>

---------

Signed-off-by: Changyeon Kim <cyzero.kim@samsung.com>
This commit is contained in:
parent
fbc9a05ddf
commit
307712a903
@ -213,6 +213,7 @@ struct vk_device_struct {
|
||||
vk_pipeline pipeline_sum_rows_f32;
|
||||
vk_pipeline pipeline_im2col_f32, pipeline_im2col_f32_f16;
|
||||
vk_pipeline pipeline_timestep_embedding_f32;
|
||||
vk_pipeline pipeline_pool2d_f32;
|
||||
|
||||
std::unordered_map<std::string, vk_pipeline_ref> pipelines;
|
||||
std::unordered_map<std::string, uint64_t> pipeline_descriptor_set_requirements;
|
||||
@ -403,6 +404,17 @@ struct vk_op_timestep_embedding_push_constants {
|
||||
uint32_t max_period;
|
||||
};
|
||||
|
||||
// Push-constant payload for the pool2d_f32 pipeline.
//
// The member order and 32-bit widths mirror the `parameter` push_constant block
// declared in pool2d.comp, and sizeof() of this struct is what the pipeline is
// created with, so fields must not be reordered or resized independently of the
// shader.
struct vk_op_pool2d_push_constants {
    // Input extents.
    uint32_t IW;
    uint32_t IH;

    // Output extents.
    uint32_t OW;
    uint32_t OH;
    uint32_t OC;

    // Number of output elements; the shader discards invocations at or past it.
    uint32_t pelements;

    // Pooling mode selector, compared against OP_POOL_MAX / OP_POOL_AVG in the shader.
    uint32_t op;

    // Kernel size, stride and padding. The shader applies the *0 members along
    // rows (height) and the *1 members along columns (width).
    int32_t k0;
    int32_t k1;
    int32_t s0;
    int32_t s1;
    int32_t p0;
    int32_t p1;
};
|
||||
|
||||
// Allow pre-recording command buffers
|
||||
struct vk_staging_memcpy {
|
||||
vk_staging_memcpy(void * _dst, const void * _src, size_t _n) : dst(_dst), src(_src), n(_n) {}
|
||||
@ -1803,6 +1815,8 @@ static void ggml_vk_load_shaders(vk_device& device) {
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_timestep_embedding_f32, "timestep_embedding_f32", timestep_embedding_f32_len, timestep_embedding_f32_data, "main", 2, sizeof(vk_op_timestep_embedding_push_constants), {256, 1, 1}, {}, 1);
|
||||
|
||||
ggml_vk_create_pipeline(device, device->pipeline_pool2d_f32, "pool2d_f32", pool2d_f32_len, pool2d_f32_data, "main", 2, sizeof(vk_op_pool2d_push_constants), {512, 1, 1}, {}, 1);
|
||||
|
||||
for (auto &c : compiles) {
|
||||
c.wait();
|
||||
}
|
||||
@ -4234,6 +4248,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
|
||||
return ctx->device->pipeline_timestep_embedding_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_POOL_2D:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_pool2d_f32;
|
||||
}
|
||||
return nullptr;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
return ctx->device->pipeline_leaky_relu_f32;
|
||||
@ -4464,6 +4483,14 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
|
||||
uint32_t half_ceil = (dim + 1) / 2;
|
||||
elements = { half_ceil, (uint32_t)src0->ne[0], 1 };
|
||||
} break;
|
||||
case GGML_OP_POOL_2D:
|
||||
{
|
||||
const uint32_t N = dst->ne[3];
|
||||
const uint32_t OC = dst->ne[2];
|
||||
const uint32_t OH = dst->ne[1];
|
||||
const uint32_t OW = dst->ne[0];
|
||||
elements = { N * OC * OH * OW, 1, 1};
|
||||
} break;
|
||||
case GGML_OP_ADD:
|
||||
case GGML_OP_DIV:
|
||||
case GGML_OP_MUL:
|
||||
@ -4914,6 +4941,34 @@ static void ggml_vk_timestep_embedding(ggml_backend_vk_context * ctx, vk_context
|
||||
}, dryrun);
|
||||
}
|
||||
|
||||
// Submits a GGML_OP_POOL_2D node through the generic ggml_vk_op_f32 path.
//
// ctx / subctx : backend context and command context, forwarded unchanged.
// src0         : pooling input; ne[0] / ne[1] are sent as IW / IH.
// dst          : pooling output; carries the op_params and the output extents.
// dryrun       : forwarded unchanged to ggml_vk_op_f32.
static void ggml_vk_pool_2d(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
    // Output extents. Their product is the number of output elements and is
    // sent as `pelements`, which the shader uses as its invocation bound.
    const uint32_t batch     = static_cast<uint32_t>(dst->ne[3]);
    const uint32_t out_c     = static_cast<uint32_t>(dst->ne[2]);
    const uint32_t out_h     = static_cast<uint32_t>(dst->ne[1]);
    const uint32_t out_w     = static_cast<uint32_t>(dst->ne[0]);
    const uint32_t n_outputs = batch * out_c * out_h * out_w;

    // Input extents.
    const uint32_t in_w = static_cast<uint32_t>(src0->ne[0]);
    const uint32_t in_h = static_cast<uint32_t>(src0->ne[1]);

    // op_params[0] selects the pooling mode.
    const uint32_t pool_op = static_cast<uint32_t>(dst->op_params[0]);

    // op_params[1..6] are forwarded pairwise swapped: the value at the even
    // index of each pair goes into the "*0" push constant and the odd index
    // into "*1". The shader applies its "*0" constants along rows (height).
    const int32_t kernel_0 = dst->op_params[2];
    const int32_t kernel_1 = dst->op_params[1];
    const int32_t stride_0 = dst->op_params[4];
    const int32_t stride_1 = dst->op_params[3];
    const int32_t pad_0    = dst->op_params[6];
    const int32_t pad_1    = dst->op_params[5];

    // Initializer order follows the field order of vk_op_pool2d_push_constants.
    ggml_vk_op_f32<vk_op_pool2d_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_POOL_2D, {
        in_w, in_h, out_w, out_h, out_c,
        n_outputs,
        pool_op,
        kernel_0, kernel_1, stride_0, stride_1, pad_0, pad_1,
    }, dryrun);
}
|
||||
|
||||
static void ggml_vk_leaky_relu(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
|
||||
const float * op_params = (const float *)dst->op_params;
|
||||
ggml_vk_op_f32<vk_op_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_LEAKY_RELU, { (uint32_t)ggml_nelements(src0), 0, op_params[0], 0.0f }, dryrun);
|
||||
@ -5792,6 +5847,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
break;
|
||||
default:
|
||||
@ -5927,6 +5983,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
ggml_vk_timestep_embedding(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_POOL_2D:
|
||||
ggml_vk_pool_2d(ctx, compute_ctx, src0, node, dryrun);
|
||||
|
||||
break;
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
ggml_vk_leaky_relu(ctx, compute_ctx, src0, node, dryrun);
|
||||
@ -6018,6 +6078,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_tensor *
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_REPEAT:
|
||||
buf = tensor->buffer;
|
||||
@ -6821,6 +6882,7 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
|
||||
case GGML_OP_SUM_ROWS:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_POOL_2D:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
return true;
|
||||
default:
|
||||
@ -7334,6 +7396,16 @@ static void ggml_vk_check_results_0(ggml_tensor * tensor) {
|
||||
const int32_t dim = tensor->op_params[0];
|
||||
const int32_t max_period = tensor->op_params[1];
|
||||
tensor_clone = ggml_timestep_embedding(ggml_ctx, src0_clone, dim, max_period);
|
||||
} else if (tensor->op == GGML_OP_POOL_2D) {
|
||||
enum ggml_op_pool op = static_cast<ggml_op_pool>(dst->op_params[0]);
|
||||
const int32_t k0 = tensor->op_params[1];
|
||||
const int32_t k1 = tensor->op_params[2];
|
||||
const int32_t s0 = tensor->op_params[3];
|
||||
const int32_t s1 = tensor->op_params[4];
|
||||
const int32_t p0 = tensor->op_params[5];
|
||||
const int32_t p1 = tensor->op_params[6];
|
||||
|
||||
tensor_clone = ggml_pool_2d(ggml_ctx, src0_clone, op, k0, k1, s0, s1, p0, p1);
|
||||
} else if (tensor->op == GGML_OP_LEAKY_RELU) {
|
||||
const float * op_params = (const float *)tensor->op_params;
|
||||
tensor_clone = ggml_leaky_relu(ggml_ctx, src0_clone, op_params[0], false);
|
||||
|
74
ggml/src/vulkan-shaders/pool2d.comp
Normal file
74
ggml/src/vulkan-shaders/pool2d.comp
Normal file
@ -0,0 +1,74 @@
|
||||
#version 450

#include "types.comp"

#extension GL_EXT_shader_16bit_storage : require

// Push constants supplied by the host. The member order and 32-bit widths must
// stay in step with the host-side vk_op_pool2d_push_constants struct.
layout(push_constant) uniform parameter {
    // Input extents.
    uint IW;
    uint IH;
    // Output extents.
    uint OW;
    uint OH;
    uint OC;
    // Number of output elements; invocations at or past it exit immediately.
    uint pelements;
    // Pooling mode: OP_POOL_MAX or OP_POOL_AVG.
    uint op;
    // Kernel size, stride and padding; main() applies the *0 members along
    // rows (height) and the *1 members along columns (width).
    int k0;
    int k1;
    int s0;
    int s1;
    int p0;
    int p1;
} p;

// Workgroup width along x.
#define BLOCK_SIZE 512
// Largest finite 32-bit float; its negation seeds the running maximum.
#define FLT_MAX 3.402823466e+38F
// Values of p.op recognised by main().
#define OP_POOL_MAX 0u
#define OP_POOL_AVG 1u

layout (local_size_x = BLOCK_SIZE, local_size_y = 1, local_size_z = 1) in;

// binding 0: pooling input, binding 1: pooling output.
layout(binding = 0) readonly buffer X {A_TYPE data_a[];};
layout(binding = 1) writeonly buffer D {D_TYPE data_d[];};
|
||||
|
||||
// Computes one pooled output element per invocation.
//
// The flat invocation index is split into (plane, output row, output column),
// the pooling window is clipped to the input plane, and the max or the average
// of the in-bounds samples is written to data_d. Averages are always scaled by
// 1 / (k0 * k1), independent of how many samples fell inside the input.
void main() {
    const uint idx = gl_GlobalInvocationID.x;
    if (idx >= p.pelements) {
        return;
    }

    const uint O_HW = p.OW * p.OH;

    // idx = nc * O_HW + cur_oh * OW + cur_ow
    const uint nc     = idx / O_HW;
    const uint rem    = idx % O_HW;
    const uint cur_oh = rem / p.OW;
    const uint cur_ow = rem % p.OW;

    // Clip the window in *signed* arithmetic before converting to uint.
    // Mixing int and uint in min() converts a negative window end (possible
    // when the padding exceeds the kernel size) to a huge unsigned value, so the
    // clamp would select the full input extent instead of an empty range.
    const int start_h = int(cur_oh) * p.s0 - p.p0;
    const uint bh = uint(max(start_h, 0));
    const uint eh = uint(clamp(start_h + p.k0, 0, int(p.IH)));

    const int start_w = int(cur_ow) * p.s1 - p.p1;
    const uint bw = uint(max(start_w, 0));
    const uint ew = uint(clamp(start_w + p.k1, 0, int(p.IW)));

    const float scale = 1.0 / float(p.k0 * p.k1);
    const bool is_avg = (p.op == OP_POOL_AVG);
    float res;

    if (is_avg) {
        res = 0.0;
    } else if (p.op == OP_POOL_MAX) {
        res = -FLT_MAX;
    } else {
        // Unknown pooling mode: leave the output element untouched.
        return;
    }

    // Base offset of the input plane this invocation reads from.
    const uint plane = nc * p.IH * p.IW;

    #pragma unroll
    for (uint i = bh; i < eh; i++) {
        #pragma unroll
        for (uint j = bw; j < ew; j++) {
            const float cur = D_TYPE(data_a[plane + i * p.IW + j]);

            // Only AVG and MAX reach this point; any other mode returned above.
            if (is_avg) {
                res += cur * scale;
            } else {
                res = max(res, cur);
            }
        }
    }

    data_d[nc * O_HW + cur_oh * p.OW + cur_ow] = res;
}
|
@ -493,6 +493,10 @@ void process_shaders(std::vector<std::future<void>>& tasks) {
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("timestep_embedding_f32", "timestep_embedding.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
|
||||
tasks.push_back(std::async(std::launch::async, [=] {
|
||||
string_to_spv("pool2d_f32", "pool2d.comp", merge_maps(base_dict, {{"A_TYPE", "float"}, {"D_TYPE", "float"}}));
|
||||
}));
|
||||
}
|
||||
|
||||
void write_output_files() {
|
||||
|
Loading…
Reference in New Issue
Block a user