mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-08-10 14:18:08 +02:00
opencl: add set_rows
for f16
and f32
(llama/14547)
* opencl: add `set_rows` for `f16` and `f32` * opencl: better choose workgroup size for `set_rows`
This commit is contained in:
@ -88,6 +88,7 @@ set(GGML_OPENCL_KERNELS
|
||||
rms_norm
|
||||
rope
|
||||
scale
|
||||
set_rows
|
||||
sigmoid
|
||||
silu
|
||||
softmax_4_f32
|
||||
|
@ -351,6 +351,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_program program_gemv_noshuffle_general;
|
||||
cl_program program_gemv_noshuffle;
|
||||
cl_program program_get_rows;
|
||||
cl_program program_set_rows;
|
||||
cl_program program_glu;
|
||||
cl_program program_im2col_f16;
|
||||
cl_program program_im2col_f32;
|
||||
@ -412,6 +413,7 @@ struct ggml_backend_opencl_context {
|
||||
cl_kernel kernel_soft_max, kernel_soft_max_4;
|
||||
cl_kernel kernel_soft_max_f16, kernel_soft_max_4_f16;
|
||||
cl_kernel kernel_get_rows_f32, kernel_get_rows_f16, kernel_get_rows_q4_0;
|
||||
cl_kernel kernel_set_rows_f32, kernel_set_rows_f16;
|
||||
cl_kernel kernel_rope_norm_f32, kernel_rope_norm_f16, kernel_rope_neox_f32, kernel_rope_neox_f16;
|
||||
cl_kernel kernel_rope_multi_f32, kernel_rope_multi_f16, kernel_rope_vision_f32, kernel_rope_vision_f16;
|
||||
cl_kernel kernel_cpy_f16_f16, kernel_cpy_f16_f32, kernel_cpy_f32_f16, kernel_cpy_f32_f32;
|
||||
@ -529,6 +531,16 @@ struct ggml_backend_opencl_context {
|
||||
fclose(ftrace);
|
||||
}
|
||||
|
||||
size_t get_kernel_workgroup_size(cl_kernel kernel) const {
|
||||
size_t workgroup_size = 0;
|
||||
size_t ret_size = 0;
|
||||
CL_CHECK(
|
||||
clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
|
||||
sizeof(size_t), &workgroup_size, &ret_size));
|
||||
GGML_ASSERT(sizeof(size_t) == ret_size);
|
||||
return workgroup_size;
|
||||
}
|
||||
|
||||
void enqueue_ndrange_kernel(cl_kernel kernel, cl_uint work_dim, size_t *global_work_size, size_t *local_work_size, const ggml_tensor * tensor) {
|
||||
#ifdef GGML_OPENCL_PROFILING
|
||||
cl_event evt;
|
||||
@ -1431,6 +1443,23 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx, ggml_cl_ve
|
||||
}
|
||||
}
|
||||
|
||||
// set_rows
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
const std::string kernel_src {
|
||||
#include "set_rows.cl.h"
|
||||
};
|
||||
#else
|
||||
const std::string kernel_src = read_file("set_rows.cl");
|
||||
#endif
|
||||
backend_ctx->program_set_rows =
|
||||
build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
|
||||
|
||||
CL_CHECK((backend_ctx->kernel_set_rows_f32 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f32", &err), err));
|
||||
CL_CHECK((backend_ctx->kernel_set_rows_f16 = clCreateKernel(backend_ctx->program_set_rows, "kernel_set_rows_f16", &err), err));
|
||||
GGML_LOG_CONT(".");
|
||||
}
|
||||
|
||||
// mul_mv_id_q4_0_f32_8x_flat
|
||||
{
|
||||
#ifdef GGML_OPENCL_EMBED_KERNELS
|
||||
@ -2233,8 +2262,17 @@ static bool ggml_opencl_supports_op(ggml_backend_dev_t dev, const struct ggml_te
|
||||
{
|
||||
// TODO: add support
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14274
|
||||
return false;
|
||||
} break;
|
||||
if (op->src[0]->type != GGML_TYPE_F32) {
|
||||
return false;
|
||||
}
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
case GGML_OP_CPY:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_CONT:
|
||||
@ -3374,6 +3412,111 @@ static void ggml_cl_get_rows(ggml_backend_t backend, const ggml_tensor * src0, c
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_set_rows(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
GGML_ASSERT(src1);
|
||||
GGML_ASSERT(src1->extra);
|
||||
GGML_ASSERT(dst);
|
||||
GGML_ASSERT(dst->extra);
|
||||
|
||||
// ne0 = ne00
|
||||
// ne2 = ne02
|
||||
// ne3 = ne03
|
||||
|
||||
const int ne01 = src0->ne[1];
|
||||
const int ne02 = src0->ne[2];
|
||||
const int ne03 = src0->ne[3];
|
||||
|
||||
const cl_ulong nb01 = src0->nb[1];
|
||||
const cl_ulong nb02 = src0->nb[2];
|
||||
const cl_ulong nb03 = src0->nb[3];
|
||||
|
||||
const int ne11 = src1->ne[1];
|
||||
const int ne12 = src1->ne[2];
|
||||
|
||||
const cl_ulong nb10 = src1->nb[0];
|
||||
const cl_ulong nb11 = src1->nb[1];
|
||||
const cl_ulong nb12 = src1->nb[2];
|
||||
|
||||
const int ne0 = dst->ne[0];
|
||||
|
||||
const cl_ulong nb1 = dst->nb[1];
|
||||
const cl_ulong nb2 = dst->nb[2];
|
||||
const cl_ulong nb3 = dst->nb[3];
|
||||
|
||||
const int nblk0 = ne0/ggml_blck_size(dst->type);
|
||||
|
||||
ggml_backend_opencl_context *backend_ctx = (ggml_backend_opencl_context *)backend->context;
|
||||
|
||||
ggml_tensor_extra_cl * extra0 = (ggml_tensor_extra_cl *)src0->extra;
|
||||
ggml_tensor_extra_cl * extra1 = (ggml_tensor_extra_cl *)src1->extra;
|
||||
ggml_tensor_extra_cl * extrad = (ggml_tensor_extra_cl *)dst->extra;
|
||||
|
||||
cl_ulong offset0 = extra0->offset + src0->view_offs;
|
||||
cl_ulong offset1 = extra1->offset + src1->view_offs;
|
||||
cl_ulong offsetd = extrad->offset + dst->view_offs;
|
||||
|
||||
cl_kernel kernel;
|
||||
|
||||
switch (dst->type) {
|
||||
case GGML_TYPE_F32:
|
||||
kernel = backend_ctx->kernel_set_rows_f32;
|
||||
break;
|
||||
case GGML_TYPE_F16:
|
||||
kernel = backend_ctx->kernel_set_rows_f16;
|
||||
break;
|
||||
default:
|
||||
GGML_ABORT("not implemented");
|
||||
}
|
||||
|
||||
CL_CHECK(clSetKernelArg(kernel, 0, sizeof(cl_mem), &extra0->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 1, sizeof(cl_ulong), &offset0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 2, sizeof(cl_mem), &extra1->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 3, sizeof(cl_ulong), &offset1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 4, sizeof(cl_mem), &extrad->data_device));
|
||||
CL_CHECK(clSetKernelArg(kernel, 5, sizeof(cl_ulong), &offsetd));
|
||||
CL_CHECK(clSetKernelArg(kernel, 6, sizeof(int), &ne01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 7, sizeof(cl_ulong), &nb01));
|
||||
CL_CHECK(clSetKernelArg(kernel, 8, sizeof(cl_ulong), &nb02));
|
||||
CL_CHECK(clSetKernelArg(kernel, 9, sizeof(cl_ulong), &nb03));
|
||||
CL_CHECK(clSetKernelArg(kernel, 10, sizeof(int), &ne11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 11, sizeof(int), &ne12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 12, sizeof(cl_ulong), &nb10));
|
||||
CL_CHECK(clSetKernelArg(kernel, 13, sizeof(cl_ulong), &nb11));
|
||||
CL_CHECK(clSetKernelArg(kernel, 14, sizeof(cl_ulong), &nb12));
|
||||
CL_CHECK(clSetKernelArg(kernel, 15, sizeof(int), &nblk0));
|
||||
CL_CHECK(clSetKernelArg(kernel, 16, sizeof(cl_ulong), &nb1));
|
||||
CL_CHECK(clSetKernelArg(kernel, 17, sizeof(cl_ulong), &nb2));
|
||||
CL_CHECK(clSetKernelArg(kernel, 18, sizeof(cl_ulong), &nb3));
|
||||
|
||||
int nth0 = 64;
|
||||
if (backend_ctx->gpu_family == INTEL) {
|
||||
nth0 = 32;
|
||||
} else if (backend_ctx->gpu_family == ADRENO) {
|
||||
nth0 = 64;
|
||||
}
|
||||
|
||||
int max_workgroup_size = backend_ctx->get_kernel_workgroup_size(kernel);
|
||||
while (nth0 < nblk0 && nth0 < max_workgroup_size) {
|
||||
nth0 *= 2;
|
||||
}
|
||||
|
||||
int rows_per_workgroup = 1;
|
||||
if (nth0 > nblk0) {
|
||||
rows_per_workgroup = nth0 / nblk0;
|
||||
nth0 = nblk0;
|
||||
}
|
||||
|
||||
size_t global_work_size[] = {
|
||||
(size_t)(ne01 + rows_per_workgroup - 1)/rows_per_workgroup*nth0,
|
||||
(size_t)ne02*rows_per_workgroup,
|
||||
(size_t)ne03};
|
||||
size_t local_work_size[] = {(size_t)nth0, (size_t)rows_per_workgroup, 1};
|
||||
|
||||
backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
|
||||
}
|
||||
|
||||
static void ggml_cl_add(ggml_backend_t backend, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0);
|
||||
GGML_ASSERT(src0->extra);
|
||||
@ -6388,6 +6531,12 @@ bool ggml_cl_compute_forward(ggml_backend_t backend, struct ggml_tensor * tensor
|
||||
}
|
||||
func = ggml_cl_get_rows;
|
||||
break;
|
||||
case GGML_OP_SET_ROWS:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
}
|
||||
func = ggml_cl_set_rows;
|
||||
break;
|
||||
case GGML_OP_CPY:
|
||||
if (!any_on_device) {
|
||||
return false;
|
||||
|
95
ggml/src/ggml-opencl/kernels/set_rows.cl
Normal file
95
ggml/src/ggml-opencl/kernels/set_rows.cl
Normal file
@ -0,0 +1,95 @@
|
||||
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
|
||||
|
||||
kernel void kernel_set_rows_f32(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = i03%ne12;
|
||||
int i11 = i02%ne11;
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
global float * dst_row = (global float *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
||||
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
|
||||
dst_row[ind] = (float)src_row[ind];
|
||||
}
|
||||
}
|
||||
|
||||
kernel void kernel_set_rows_f16(
|
||||
global char * src0,
|
||||
ulong offset0,
|
||||
global char * src1,
|
||||
ulong offset1,
|
||||
global char * dst,
|
||||
ulong offsetd,
|
||||
int ne01,
|
||||
ulong nb01,
|
||||
ulong nb02,
|
||||
ulong nb03,
|
||||
int ne11,
|
||||
int ne12,
|
||||
ulong nb10,
|
||||
ulong nb11,
|
||||
ulong nb12,
|
||||
int nblk0,
|
||||
ulong nb1,
|
||||
ulong nb2,
|
||||
ulong nb3
|
||||
) {
|
||||
src0 = src0 + offset0;
|
||||
src1 = src1 + offset1;
|
||||
dst = dst + offsetd;
|
||||
|
||||
int i03 = get_group_id(2);
|
||||
int i02 = get_group_id(1);
|
||||
int i01 = get_group_id(0)*get_local_size(1) + get_local_id(1);
|
||||
|
||||
if (i01 >= ne01) {
|
||||
return;
|
||||
}
|
||||
|
||||
int i12 = i03%ne12;
|
||||
int i11 = i02%ne11;
|
||||
|
||||
int i10 = i01;
|
||||
long i1 = ((global long *)(src1 + i10*nb10 + i11*nb11 + i12*nb12))[0];
|
||||
|
||||
global half * dst_row = (global half *) (dst + i1*nb1 + i02*nb2 + i03*nb3);
|
||||
global float * src_row = (global float *) (src0 + i01*nb01 + i02*nb02 + i03*nb03);
|
||||
|
||||
for (int ind = get_local_id(0); ind < nblk0; ind += get_local_size(0)) {
|
||||
dst_row[ind] = src_row[ind];
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user