cuda/cpu: Increase support for fp16 unary operations (ggml/1125)

* Support fp16 unary operations in the CUDA backend

* cpu: increase fp16 support for unary operators in the CPU backend

* cuda: increase fp16 support for unary operators in the CUDA backend

* Add test cases for fp16 unary operators

* metal: update supports_op for unary operators that don't support fp16, to prevent test-backend-ops from failing

* metal: address PR comments on unary op support after the fp16 unary tests
cmdr2 2025-02-28 12:34:39 +05:30 committed by Georgi Gerganov
parent 2e180184a8
commit 60d2ddebdf
6 changed files with 1248 additions and 164 deletions
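
The CUDA changes all follow one pattern: each unary kernel is templated on the element type, and the host-side wrapper keeps a single entry point that dispatches to a half or float instantiation based on the tensor type. A minimal sketch of that pattern, with hypothetical names (op_negate_example, negate_example_cuda) that are not part of the ggml API:

#include <cuda_fp16.h>

// Kernel templated on the element type (float or half).
// Half arithmetic in device code requires a GPU arch that supports it (sm_53+).
template <class T>
static __global__ void op_negate_example(const T * x, T * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = -x[i];
}

// Single host entry point: selects the half or float instantiation at runtime.
static void negate_example_cuda(const void * x, void * dst, const int k, const bool is_f16, cudaStream_t stream) {
    const int block_size = 256;
    const int num_blocks = (k + block_size - 1) / block_size;
    if (is_f16) {
        op_negate_example<<<num_blocks, block_size, 0, stream>>>((const half *) x, (half *) dst, k);
    } else {
        op_negate_example<<<num_blocks, block_size, 0, stream>>>((const float *) x, (float *) dst, k);
    }
}

The Metal backend is not extended here; its supports_op is instead tightened so that ops without fp16 kernels report fp32-only support and test-backend-ops skips them.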

File diff suppressed because it is too large.

clamp.cu

@@ -1,6 +1,7 @@
 #include "clamp.cuh"
 
-static __global__ void clamp_f32(const float * x, float * dst, const float min, const float max, const int k) {
+template <class T>
+static __global__ void op_clamp(const T * x, T * dst, const T min, const T max, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -10,25 +11,31 @@ static __global__ void clamp_f32(const float * x, float * dst, const float min,
     dst[i] = x[i] < min ? min : (x[i] > max ? max : x[i]);
 }
 
-static void clamp_f32_cuda(const float * x, float * dst, const float min, const float max, const int k, cudaStream_t stream) {
+template <class T>
+static void clamp_cuda(const T * x, T * dst, const T min, const T max, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_CLAMP_BLOCK_SIZE - 1) / CUDA_CLAMP_BLOCK_SIZE;
-    clamp_f32<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
+    op_clamp<<<num_blocks, CUDA_CLAMP_BLOCK_SIZE, 0, stream>>>(x, dst, min, max, k);
 }
 
 void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
     float min;
     float max;
     memcpy(&min, dst->op_params, sizeof(float));
     memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
 
-    clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        clamp_cuda((const half *)src0_d, (half *)dst_d, (half)min, (half)max, ggml_nelements(src0), stream);
+    } else {
+        clamp_cuda((const float *)src0_d, (float *)dst_d, (float)min, (float)max, ggml_nelements(src0), stream);
+    }
 }

ggml-cuda.cu

@@ -2145,6 +2145,12 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
             break;
         case GGML_OP_UNARY:
             switch (ggml_get_unary_op(dst)) {
+                case GGML_UNARY_OP_ABS:
+                    ggml_cuda_op_abs(ctx, dst);
+                    break;
+                case GGML_UNARY_OP_SGN:
+                    ggml_cuda_op_sgn(ctx, dst);
+                    break;
                 case GGML_UNARY_OP_NEG:
                     ggml_cuda_op_neg(ctx, dst);
                     break;
@@ -2242,6 +2248,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_CLAMP:
             ggml_cuda_op_clamp(ctx, dst);
             break;
+        case GGML_OP_LOG:
+            ggml_cuda_op_log(ctx, dst);
+            break;
         case GGML_OP_NONE:
         case GGML_OP_RESHAPE:
         case GGML_OP_VIEW:
@@ -2960,6 +2969,8 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
     switch (op->op) {
        case GGML_OP_UNARY:
            switch (ggml_get_unary_op(op)) {
+                case GGML_UNARY_OP_ABS:
+                case GGML_UNARY_OP_SGN:
                case GGML_UNARY_OP_NEG:
                case GGML_UNARY_OP_STEP:
                case GGML_UNARY_OP_GELU:
@@ -3166,6 +3177,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
        case GGML_OP_SIN:
        case GGML_OP_COS:
        case GGML_OP_CLAMP:
+        case GGML_OP_LOG:
            return true;
        case GGML_OP_CONT:
            return op->src[0]->type != GGML_TYPE_BF16;

unary.cu

@@ -1,6 +1,29 @@
 #include "unary.cuh"
 
-static __global__ void neg_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_abs(const T * x, T * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = fabsf(x[i]);
+}
+
+template <class T>
+static __global__ void op_sgn(const T * x, T * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = (T)(x[i] > (T)0.f ? 1.f : ((x[i] < (T)0.f ? -1.f : 0.f)));
+}
+
+template <class T>
+static __global__ void op_neg(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -10,61 +33,67 @@ static __global__ void neg_f32(const float * x, float * dst, const int k) {
     dst[i] = -x[i];
 }
 
-static __global__ void step_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_step(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = x[i] > 0.0f;
+    dst[i] = x[i] > (T)0.0f;
 }
 
-static __global__ void gelu_f32(const float * x, float * dst, const int k) {
-    const float GELU_COEF_A = 0.044715f;
-    const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
+template <class T>
+static __global__ void op_gelu(const T * x, T * dst, const int k) {
+    const T GELU_COEF_A = 0.044715f;
+    const T SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    float xi = x[i];
-    dst[i] = 0.5f*xi*(1.0f + tanhf(SQRT_2_OVER_PI*xi*(1.0f + GELU_COEF_A*xi*xi)));
+    T xi = x[i];
+    dst[i] = (T)0.5f*xi*((T)1.0f + (T)tanhf(SQRT_2_OVER_PI*xi*((T)1.0f + GELU_COEF_A*xi*xi)));
 }
 
-static __global__ void gelu_quick_f32(const float * x, float * dst, int k) {
-    const float GELU_QUICK_COEF = -1.702f;
+template <class T>
+static __global__ void op_gelu_quick(const T * x, T * dst, int k) {
+    const T GELU_QUICK_COEF = -1.702f;
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = x[i] * (1.0f / (1.0f + expf(GELU_QUICK_COEF * x[i])));
+    dst[i] = x[i] * ((T)1.0f / ((T)1.0f + (T)expf(GELU_QUICK_COEF * x[i])));
 }
 
-static __global__ void silu_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_silu(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = x[i] / (1.0f + expf(-x[i]));
+    dst[i] = x[i] / ((T)1.0f + (T)expf(-x[i]));
 }
 
-static __global__ void silu_back_f32(
-    const float * grad, const float * xf, float * dst, const int k) {
+template <class T>
+static __global__ void op_silu_back(
+    const T * grad, const T * xf, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    const float xfi = xf[i];
-    const float s = 1.0f / (1.0f + expf(-xfi));
-    dst[i] = grad[i] * s * (1.0f + xfi * (1.0f - s));
+    const T xfi = xf[i];
+    const T s = (T)1.0f / ((T)1.0f + (T)expf(-xfi));
+    dst[i] = grad[i] * s * ((T)1.0f + xfi * ((T)1.0f - s));
 }
 
-static __global__ void tanh_f32(const float * x, float * dst, int k) {
+template <class T>
+static __global__ void op_tanh(const T * x, T * dst, int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
     if (i >= k) {
         return;
@@ -72,7 +101,8 @@ static __global__ void tanh_f32(const float * x, float * dst, int k) {
     dst[i] = tanhf(x[i]);
 }
 
-static __global__ void relu_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_relu(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -81,34 +111,38 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
     dst[i] = fmaxf(x[i], 0);
 }
 
-static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_sigmoid(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = 1.0f / (1.0f + expf(-x[i]));
+    dst[i] = (T)1.0f / ((T)1.0f + (T)expf(-x[i]));
 }
 
-static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_hardsigmoid(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
+    dst[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + (T)3.0f) / (T)6.0f));
 }
 
-static __global__ void hardswish_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_hardswish(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f));
+    dst[i] = x[i] * (T)fminf(1.0f, fmaxf(0.0f, (x[i] + (T)3.0f) / (T)6.0f));
 }
 
-static __global__ void exp_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_exp(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -117,15 +151,17 @@ static __global__ void exp_f32(const float * x, float * dst, const int k) {
     dst[i] = expf(x[i]);
 }
 
-static __global__ void leaky_relu_f32(const float * x, float * dst, const int k, const float negative_slope) {
+template <class T>
+static __global__ void op_leaky_relu(const T * x, T * dst, const int k, const float negative_slope) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
         return;
     }
 
-    dst[i] = fmaxf(x[i], 0) + fminf(x[i], 0.0f) * negative_slope;
+    dst[i] = (T)fmaxf(x[i], 0) + (T)fminf(x[i], 0.0f) * (T)negative_slope;
 }
 
-static __global__ void sqr_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_sqr(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -134,7 +170,8 @@ static __global__ void sqr_f32(const float * x, float * dst, const int k) {
     dst[i] = x[i] * x[i];
 }
 
-static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_sqrt(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -143,7 +180,8 @@ static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
     dst[i] = sqrtf(x[i]);
 }
 
-static __global__ void sin_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_sin(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -152,7 +190,8 @@ static __global__ void sin_f32(const float * x, float * dst, const int k) {
     dst[i] = sinf(x[i]);
 }
 
-static __global__ void cos_f32(const float * x, float * dst, const int k) {
+template <class T>
+static __global__ void op_cos(const T * x, T * dst, const int k) {
     const int i = blockDim.x*blockIdx.x + threadIdx.x;
 
     if (i >= k) {
@@ -161,145 +200,248 @@ static __global__ void cos_f32(const float * x, float * dst, const int k) {
     dst[i] = cosf(x[i]);
 }
 
-static void neg_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static __global__ void op_log(const T * x, T * dst, const int k) {
+    const int i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    dst[i] = logf(x[i]);
+}
+
+template <class T>
+static void abs_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    neg_f32<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_abs<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void step_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void sgn_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
+    op_sgn<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+template <class T>
+static void neg_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
+    op_neg<<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+template <class T>
+static void step_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_STEP_BLOCK_SIZE - 1) / CUDA_STEP_BLOCK_SIZE;
-    step_f32<<<num_blocks, CUDA_STEP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_step<<<num_blocks, CUDA_STEP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void gelu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void gelu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    gelu_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_gelu<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void gelu_quick_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void gelu_quick_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_GELU_BLOCK_SIZE - 1) / CUDA_GELU_BLOCK_SIZE;
-    gelu_quick_f32<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_gelu_quick<<<num_blocks, CUDA_GELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void silu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void silu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
-    silu_f32<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_silu<<<num_blocks, CUDA_SILU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void silu_back_f32_cuda(const float * grad, const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void silu_back_cuda(const T * grad, const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SILU_BACK_BLOCK_SIZE - 1) / CUDA_SILU_BLOCK_SIZE;
-    silu_back_f32<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
+    op_silu_back<<<num_blocks, CUDA_SILU_BACK_BLOCK_SIZE, 0, stream>>>(grad, x, dst, k);
 }
 
-static void tanh_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void tanh_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_TANH_BLOCK_SIZE - 1) / CUDA_TANH_BLOCK_SIZE;
-    tanh_f32<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_tanh<<<num_blocks, CUDA_TANH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void relu_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_relu<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void sigmoid_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
-    sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_sigmoid<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void hardsigmoid_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
-    hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_hardsigmoid<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void hardswish_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void hardswish_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_HARDSWISH_BLOCK_SIZE - 1) / CUDA_HARDSWISH_BLOCK_SIZE;
-    hardswish_f32<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_hardswish<<<num_blocks, CUDA_HARDSWISH_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void exp_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void exp_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_EXP_BLOCK_SIZE - 1) / CUDA_EXP_BLOCK_SIZE;
-    exp_f32<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_exp<<<num_blocks, CUDA_EXP_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void leaky_relu_f32_cuda(const float * x, float * dst, const int k, const float negative_slope, cudaStream_t stream) {
+template <class T>
+static void leaky_relu_cuda(const T * x, T * dst, const int k, const float negative_slope, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_RELU_BLOCK_SIZE - 1) / CUDA_RELU_BLOCK_SIZE;
-    leaky_relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
+    op_leaky_relu<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k, negative_slope);
 }
 
-static void sqr_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void sqr_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SQR_BLOCK_SIZE - 1) / CUDA_SQR_BLOCK_SIZE;
-    sqr_f32<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_sqr<<<num_blocks, CUDA_SQR_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void sqrt_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void sqrt_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SQRT_BLOCK_SIZE - 1) / CUDA_SQRT_BLOCK_SIZE;
-    sqrt_f32<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_sqrt<<<num_blocks, CUDA_SQRT_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void sin_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void sin_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_SIN_BLOCK_SIZE - 1) / CUDA_SIN_BLOCK_SIZE;
-    sin_f32<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_sin<<<num_blocks, CUDA_SIN_BLOCK_SIZE, 0, stream>>>(x, dst, k);
 }
 
-static void cos_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
+template <class T>
+static void cos_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
     const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
-    cos_f32<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    op_cos<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+template <class T>
+static void log_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
+    const int num_blocks = (k + CUDA_COS_BLOCK_SIZE - 1) / CUDA_COS_BLOCK_SIZE;
+    op_log<<<num_blocks, CUDA_COS_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+}
+
+void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    if (src0->type == GGML_TYPE_F16) {
+        abs_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        abs_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
+}
+
+void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    if (src0->type == GGML_TYPE_F16) {
+        sgn_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        sgn_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    neg_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        neg_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        neg_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    step_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        step_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        step_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_gelu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    gelu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        gelu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        gelu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_silu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    silu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        silu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        silu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -314,179 +456,263 @@ void ggml_cuda_op_silu_back(ggml_backend_cuda_context & ctx, ggml_tensor * dst)
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    silu_back_f32_cuda(src0_d, src1_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        silu_back_cuda((const half *)src0_d, (const half *)src1_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        silu_back_cuda((const float*)src0_d, (const float*)src1_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_gelu_quick(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    gelu_quick_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        gelu_quick_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        gelu_quick_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    tanh_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        tanh_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        tanh_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        relu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
    cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        sigmoid_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        sigmoid_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    hardsigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        hardsigmoid_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        hardsigmoid_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    hardswish_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        hardswish_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        hardswish_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_exp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    exp_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        exp_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        exp_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_leaky_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
     float negative_slope;
     memcpy(&negative_slope, dst->op_params, sizeof(float));
 
-    leaky_relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), negative_slope, stream);
+    if (src0->type == GGML_TYPE_F16) {
+        leaky_relu_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), negative_slope, stream);
+    } else {
+        leaky_relu_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), negative_slope, stream);
+    }
 }
 
 void ggml_cuda_op_sqr(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    sqr_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        sqr_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        sqr_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    sqrt_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        sqrt_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        sqrt_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    sin_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        sin_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        sin_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }
 
 void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
     cudaStream_t stream = ctx.stream();
 
     GGML_ASSERT(ggml_is_contiguous(src0));
 
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
 
-    cos_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
+    if (src0->type == GGML_TYPE_F16) {
+        cos_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        cos_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
+}
+
+void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
+    const ggml_tensor * src0 = dst->src[0];
+    const void * src0_d = src0->data;
+    void * dst_d = dst->data;
+    cudaStream_t stream = ctx.stream();
+
+    GGML_ASSERT(ggml_is_contiguous(src0));
+
+    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
+    GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
+    GGML_ASSERT(src0->type == dst->type);
+
+    if (src0->type == GGML_TYPE_F16) {
+        log_cuda((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+    } else {
+        log_cuda((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+    }
 }

unary.cuh

@@ -16,6 +16,10 @@
 #define CUDA_SIN_BLOCK_SIZE 256
 #define CUDA_COS_BLOCK_SIZE 256
 
+void ggml_cuda_op_abs(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_sgn(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
 void ggml_cuda_op_neg(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_step(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
@@ -49,3 +53,5 @@ void ggml_cuda_op_sqrt(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 void ggml_cuda_op_sin(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
 
 void ggml_cuda_op_cos(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void ggml_cuda_op_log(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

ggml-metal.m

@@ -1200,7 +1200,7 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
             case GGML_UNARY_OP_GELU_QUICK:
             case GGML_UNARY_OP_SILU:
             case GGML_UNARY_OP_ELU:
-                return ggml_is_contiguous(op->src[0]);
+                return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
             default:
                 return false;
         }
@@ -1210,21 +1210,26 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_TRANSPOSE:
        case GGML_OP_PERMUTE:
        case GGML_OP_CONCAT:
+            return true;
        case GGML_OP_ADD:
        case GGML_OP_SUB:
-        case GGML_OP_ACC:
        case GGML_OP_MUL:
        case GGML_OP_DIV:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_ACC:
        case GGML_OP_REPEAT:
        case GGML_OP_SCALE:
-        case GGML_OP_CLAMP:
        case GGML_OP_CONV_TRANSPOSE_1D:
            return true;
+        case GGML_OP_CLAMP:
+            return op->src[0]->type == GGML_TYPE_F32;
        case GGML_OP_SQR:
        case GGML_OP_SQRT:
        case GGML_OP_SIN:
        case GGML_OP_COS:
-            return ggml_is_contiguous(op->src[0]);
+            return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_LOG:
+            return false; // TODO: implement
        case GGML_OP_SUM_ROWS:
        case GGML_OP_SOFT_MAX:
        case GGML_OP_GROUP_NORM:
@@ -1254,10 +1259,11 @@ static bool ggml_metal_supports_op(const struct ggml_backend_metal_device_contex
        case GGML_OP_UPSCALE:
        case GGML_OP_PAD:
        case GGML_OP_PAD_REFLECT_1D:
-        case GGML_OP_ARANGE:
        case GGML_OP_TIMESTEP_EMBEDDING:
        case GGML_OP_ARGSORT:
        case GGML_OP_LEAKY_RELU:
+            return op->src[0]->type == GGML_TYPE_F32;
+        case GGML_OP_ARANGE:
            return true;
        case GGML_OP_FLASH_ATTN_EXT:
            if (op->src[1]->type != op->src[2]->type) {