@@ -425,13 +425,14 @@ struct vk_device_struct {
vk_pipeline pipeline_div_norepeat[2][2][2];

vk_pipeline pipeline_concat_f32, pipeline_concat_f16, pipeline_concat_i32;
vk_pipeline pipeline_upscale_f32;
vk_pipeline pipeline_upscale_nearest_f32, pipeline_upscale_bilinear_f32, pipeline_upscale_bilinear_ac_f32;
vk_pipeline pipeline_scale_f32;
vk_pipeline pipeline_sqr_f32;
vk_pipeline pipeline_sin_f32;
vk_pipeline pipeline_cos_f32;
vk_pipeline pipeline_clamp_f32;
vk_pipeline pipeline_pad_f32;
vk_pipeline pipeline_roll_f32;
vk_pipeline pipeline_repeat_f32, pipeline_repeat_back_f32;
vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16, pipeline_cpy_f16_f32, pipeline_cpy_f32_bf16;
vk_pipeline pipeline_contig_cpy_f32_f32, pipeline_contig_cpy_f32_f16, pipeline_contig_cpy_f16_f16, pipeline_contig_cpy_f16_f32, pipeline_contig_cpy_f32_bf16;
@@ -694,6 +695,37 @@ struct vk_op_unary_push_constants {
};
static_assert(sizeof(vk_op_unary_push_constants) <= 128, "sizeof(vk_op_unary_push_constants) must be <= 128");

static vk_op_unary_push_constants vk_op_unary_push_constants_init(const ggml_tensor * src0, const ggml_tensor * dst, int64_t ne = 0) {
GGML_ASSERT(ne != 0 || (ggml_nelements(src0) == ggml_nelements(dst)));
ne = ne != 0 ? ne : ggml_nelements(dst);
GGML_ASSERT(ne <= (int64_t)std::numeric_limits<uint32_t>::max());

vk_op_unary_push_constants p{};
p.ne = (uint32_t)ne;

size_t src0_tsize = ggml_type_size(src0->type);
p.ne00 = (uint32_t)src0->ne[0];
p.ne01 = (uint32_t)src0->ne[1];
p.ne02 = (uint32_t)src0->ne[2];
p.ne03 = (uint32_t)src0->ne[3];
p.nb00 = (uint32_t)(src0->nb[0] / src0_tsize);
p.nb01 = (uint32_t)(src0->nb[1] / src0_tsize);
p.nb02 = (uint32_t)(src0->nb[2] / src0_tsize);
p.nb03 = (uint32_t)(src0->nb[3] / src0_tsize);

size_t dst_tsize = ggml_type_size(dst->type);
p.ne10 = (uint32_t)dst->ne[0];
p.ne11 = (uint32_t)dst->ne[1];
p.ne12 = (uint32_t)dst->ne[2];
p.ne13 = (uint32_t)dst->ne[3];
p.nb10 = (uint32_t)(dst->nb[0] / dst_tsize);
p.nb11 = (uint32_t)(dst->nb[1] / dst_tsize);
p.nb12 = (uint32_t)(dst->nb[2] / dst_tsize);
p.nb13 = (uint32_t)(dst->nb[3] / dst_tsize);

return p; // fastdiv values and offsets are initialized later in ggml_vk_op
}

// See https://gmplib.org/~tege/divcnst-pldi94.pdf figure 4.1.
// Precompute mp (m' in the paper) and L such that division
// can be computed using a multiply (high 32b of 64b result)
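For reference, the precomputation the comment above describes can be sketched as follows. This is an illustrative sketch only, not taken from this diff: the helper name init_fastdiv_values_sketch and the stated divisor bound are assumptions. It picks L = ceil(log2(d)) and mp = floor(2^32 * (2^L - d) / d) + 1, so that n / d can later be evaluated as (mulhi(n, mp) + n) >> L, where mulhi is the high 32 bits of the 64-bit product.

#include <cstdint>

// Illustrative sketch of the fastdiv precomputation (assumes a divisor 1 <= d < 2^31).
static void init_fastdiv_values_sketch(uint32_t d, uint32_t & mp, uint32_t & L) {
    // L = ceil(log2(d))
    L = 0;
    while (L < 32 && (uint32_t{1} << L) < d) {
        L++;
    }
    // mp = floor(2^32 * (2^L - d) / d) + 1, computed in 64 bits to avoid overflow
    mp = (uint32_t)(((uint64_t{1} << 32) * ((uint64_t{1} << L) - d)) / d + 1);
}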
@@ -863,6 +895,7 @@ struct vk_op_conv2d_dw_push_constants {

struct vk_op_upscale_push_constants {
uint32_t ne; uint32_t a_offset; uint32_t d_offset;
uint32_t ne00; uint32_t ne01;
uint32_t nb00; uint32_t nb01; uint32_t nb02; uint32_t nb03;
uint32_t ne10; uint32_t ne11; uint32_t ne12; uint32_t ne13;
float sf0; float sf1; float sf2; float sf3;
@@ -2824,7 +2857,9 @@ static void ggml_vk_load_shaders(vk_device& device) {
ggml_vk_create_pipeline(device, device->pipeline_concat_f16, "concat_f16", concat_f16_len, concat_f16_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_concat_i32, "concat_i32", concat_i32_len, concat_i32_data, "main", 3, sizeof(vk_op_binary_push_constants), {512, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_upscale_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_nearest_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_NEAREST}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR}, 1);
ggml_vk_create_pipeline(device, device->pipeline_upscale_bilinear_ac_f32, "upscale_f32", upscale_f32_len, upscale_f32_data, "main", 2, sizeof(vk_op_upscale_push_constants), {512, 1, 1}, {GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS}, 1);

ggml_vk_create_pipeline(device, device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

@@ -2836,6 +2871,8 @@ static void ggml_vk_load_shaders(vk_device& device) {

ggml_vk_create_pipeline(device, device->pipeline_pad_f32, "pad_f32", pad_f32_len, pad_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_roll_f32, "roll_f32", roll_f32_len, roll_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

ggml_vk_create_pipeline(device, device->pipeline_repeat_f32, "repeat_f32", repeat_f32_len, repeat_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);
ggml_vk_create_pipeline(device, device->pipeline_repeat_back_f32, "repeat_back_f32", repeat_back_f32_len, repeat_back_f32_data, "main", 2, sizeof(vk_op_unary_push_constants), {512, 1, 1}, {}, 1);

@@ -6502,8 +6539,16 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
}
return nullptr;
case GGML_OP_UPSCALE:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32 && dst->op_params[0] == GGML_SCALE_MODE_NEAREST) {
return ctx->device->pipeline_upscale_f32;
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
int mode = ggml_get_op_params_i32(dst, 0);
switch (mode) {
case GGML_SCALE_MODE_NEAREST:
return ctx->device->pipeline_upscale_nearest_f32;
case GGML_SCALE_MODE_BILINEAR:
return ctx->device->pipeline_upscale_bilinear_f32;
case GGML_SCALE_MODE_BILINEAR | GGML_SCALE_FLAG_ALIGN_CORNERS:
return ctx->device->pipeline_upscale_bilinear_ac_f32;
}
}
return nullptr;
case GGML_OP_SCALE:
@@ -6536,6 +6581,11 @@ static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
return ctx->device->pipeline_pad_f32;
}
return nullptr;
case GGML_OP_ROLL:
if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
return ctx->device->pipeline_roll_f32;
}
return nullptr;
case GGML_OP_REPEAT:
if (ggml_type_size(src0->type) == sizeof(float) && ggml_type_size(dst->type) == sizeof(float)) {
return ctx->device->pipeline_repeat_f32;
@@ -7085,6 +7135,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context& subctx, co
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_REPEAT:
case GGML_OP_REPEAT_BACK:
case GGML_OP_CPY:
@@ -7546,14 +7597,21 @@ static void ggml_vk_concat(ggml_backend_vk_context * ctx, vk_context& subctx, co

static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t mode = (uint32_t)ggml_get_op_params_i32(dst, 0);

const float sf0 = (float)dst->ne[0] / src0->ne[0];
const float sf1 = (float)dst->ne[1] / src0->ne[1];
const float sf2 = (float)dst->ne[2] / src0->ne[2];
const float sf3 = (float)dst->ne[3] / src0->ne[3];
float sf0 = (float)dst->ne[0] / src0->ne[0];
float sf1 = (float)dst->ne[1] / src0->ne[1];
float sf2 = (float)dst->ne[2] / src0->ne[2];
float sf3 = (float)dst->ne[3] / src0->ne[3];

if (mode & GGML_SCALE_FLAG_ALIGN_CORNERS) {
sf0 = (float)(dst->ne[0] - 1) / (src0->ne[0] - 1);
sf1 = (float)(dst->ne[1] - 1) / (src0->ne[1] - 1);
}

ggml_vk_op_f32<vk_op_upscale_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_UPSCALE, {
(uint32_t)ggml_nelements(dst), 0, 0,
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1],
(uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t)dst->ne[0], (uint32_t)dst->ne[1], (uint32_t)dst->ne[2],(uint32_t)dst->ne[3],
sf0, sf1, sf2, sf3,
@@ -7561,123 +7619,64 @@ static void ggml_vk_upscale(ggml_backend_vk_context * ctx, vk_context& subctx, c
}

static void ggml_vk_scale(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
p.param1 = ggml_get_op_params_f32(dst, 0);
p.param2 = ggml_get_op_params_f32(dst, 1);

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
op_params[0], op_params[1],
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SCALE, std::move(p), dryrun);
}

static void ggml_vk_sqr(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SQR, vk_op_unary_push_constants_init(src0, dst), dryrun);
}

static void ggml_vk_sin(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_SIN, vk_op_unary_push_constants_init(src0, dst), dryrun);
}

static void ggml_vk_cos(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_COS, vk_op_unary_push_constants_init(src0, dst), dryrun);
}

static void ggml_vk_clamp(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
float * op_params = (float *)dst->op_params;
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
p.param1 = ggml_get_op_params_f32(dst, 0);
p.param2 = ggml_get_op_params_f32(dst, 1);

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, {
(uint32_t)ggml_nelements(src0),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
op_params[0], op_params[1],
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CLAMP, std::move(p), dryrun);
}

static void ggml_vk_pad(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, std::move(p), dryrun);
}

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_PAD, {
(uint32_t)ggml_nelements(dst),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
static void ggml_vk_roll(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const int32_t s0 = ggml_get_op_params_i32(dst, 0);
const int32_t s1 = ggml_get_op_params_i32(dst, 1);
const int32_t s2 = ggml_get_op_params_i32(dst, 2);
const int32_t s3 = ggml_get_op_params_i32(dst, 3);
const uint32_t s01_packed = ((s0 + 0x8000) << 16) | (s1 + 0x8000);
const uint32_t s23_packed = ((s2 + 0x8000) << 16) | (s3 + 0x8000);

vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst);
memcpy(&p.param1, &s01_packed, sizeof(float));
memcpy(&p.param2, &s23_packed, sizeof(float));

ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_ROLL, std::move(p), dryrun);
}

static void ggml_vk_repeat(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, {
(uint32_t)ggml_nelements(dst),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT, std::move(p), dryrun);
}

static void ggml_vk_repeat_back(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, {
(uint32_t)ggml_nelements(dst),
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ggml_nelements(dst));
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_REPEAT_BACK, std::move(p), dryrun);
}

static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, ggml_tensor * dst, bool dryrun = false) {
const uint32_t src0_type_size = ggml_type_size(src0->type);
const uint32_t dst_type_size = ggml_type_size(dst->type);

uint32_t ne = (uint32_t)ggml_nelements(src0);
if (ggml_is_quantized(src0->type) && ggml_is_quantized(dst->type)) {
// Convert from number of logical elements to 2- or 4-byte units.
@@ -7689,14 +7688,8 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context& subctx, const
}
}

ggml_vk_op_f32<vk_op_unary_push_constants>(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, {
ne,
(uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->ne[2], (uint32_t)src0->ne[3], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, (uint32_t)src0->nb[3] / src0_type_size,
(uint32_t) dst->ne[0], (uint32_t) dst->ne[1], (uint32_t) dst->ne[2], (uint32_t) dst->ne[3], (uint32_t) dst->nb[0] / dst_type_size, (uint32_t) dst->nb[1] / dst_type_size, (uint32_t) dst->nb[2] / dst_type_size, (uint32_t) dst->nb[3] / dst_type_size,
0,
0.0f, 0.0f,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
}, dryrun);
vk_op_unary_push_constants p = vk_op_unary_push_constants_init(src0, dst, ne);
ggml_vk_op_f32(ctx, subctx, src0, nullptr, nullptr, dst, GGML_OP_CPY, std::move(p), dryrun);
}

static void ggml_vk_set_rows(ggml_backend_vk_context * ctx, vk_context& subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, bool dryrun = false) {
@@ -9033,6 +9026,7 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
case GGML_OP_CONT:
@@ -9204,6 +9198,10 @@ static bool ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_cgraph * cgr
case GGML_OP_PAD:
ggml_vk_pad(ctx, compute_ctx, src0, node, dryrun);

break;
case GGML_OP_ROLL:
ggml_vk_roll(ctx, compute_ctx, src0, node, dryrun);

break;
case GGML_OP_CPY:
case GGML_OP_CONT:
@@ -9428,6 +9426,7 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_cgraph *
case GGML_OP_COS:
case GGML_OP_CLAMP:
case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_CPY:
case GGML_OP_SET_ROWS:
case GGML_OP_CONT:
@@ -10594,13 +10593,12 @@ static bool ggml_backend_vk_device_supports_op(ggml_backend_dev_t dev, const ggm
case GGML_OP_CLAMP:
return op->src[0]->type == GGML_TYPE_F32;
case GGML_OP_UPSCALE:
return op->op_params[0] == GGML_SCALE_MODE_NEAREST;
case GGML_OP_ACC:
case GGML_OP_CONCAT:
case GGML_OP_SCALE:
case GGML_OP_PAD:
case GGML_OP_ROLL:
case GGML_OP_DIAG_MASK_INF:
return true;
case GGML_OP_SOFT_MAX:
case GGML_OP_SOFT_MAX_BACK:
case GGML_OP_ARGSORT: