diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 67c0223c..cbf9783b 100644
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -65,6 +65,7 @@
 #include
 #include
 #include
+#include <aclnnop/aclnn_grouped_matmul_v2.h>
 #include
 
 #include
@@ -2587,3 +2588,147 @@ void ggml_cann_step(ggml_backend_cann_context& ctx, ggml_tensor* dst){
 
     ggml_cann_release_resources(ctx, acl_src, acl_dst, alpha);
 }
+
+/**
+ * @brief Performs expert-specific matrix multiplication (MoE) with
+ *        floating-point precision using the CANN backend.
+ *
+ * For every routing entry in `ids` (dst->src[2]) the selected expert matrix
+ * of `src0` (dst->src[0]) is multiplied with the matching row of `src1`
+ * (dst->src[1]) and the product is written to the matching row of `dst`.
+ * The routing indices are first copied to the host, which synchronizes the
+ * stream. One 2-D tensor view is then created per (token, slot) pair and the
+ * views are submitted to GroupedMatmulV2 in batches of GROUP_SIZE.
+ *
+ * @param ctx The context for executing CANN backend operations.
+ * @param dst The destination tensor where the MoE multiplication result
+ *            will be stored.
+ *
+ * @note F16 expert weights are cast to F32 into a pool buffer first; `src1`
+ *       and `dst` are accessed as F32.
+ */
+static void ggml_cann_mul_mat_id_fp(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    //dst   [M, K, N, 1]
+    ggml_tensor * src0 = dst->src[0];  //src0  [D, M, A, 1]
+    ggml_tensor * src1 = dst->src[1];  //src1  [D, B, N, 1], B = K or B = 1
+    ggml_tensor * ids  = dst->src[2];  //ids   [K, N]
+
+    GGML_TENSOR_BINARY_OP_LOCALS
+
+    // copy index from npu to cpu
+    int64_t n_as  = ne02;        // A
+    int64_t n_ids = ids->ne[0];  // K
+
+    std::vector<char> ids_host(ggml_nbytes(ids));
+    ggml_cann_async_memcpy(ctx, ids_host.data(), ids->data, ggml_nbytes(ids),
+        ACL_MEMCPY_DEVICE_TO_HOST);
+    // the indices are read on the host below, so wait for the copy to finish
+    ACL_CHECK(aclrtSynchronizeStream(ctx.stream()));
+
+    char * src0_original = (char *) src0->data;
+    char * src1_original = (char *) src1->data;
+    char * dst_original  = (char *) dst->data;
+    size_t ori_src0_nb[4] = {nb00, nb01, nb02, nb03};
+
+    // every view below is created as F32: cast F16 weights to F32 first
+    ggml_cann_pool_alloc src0_cast_allocator;
+    if (src0->type == GGML_TYPE_F16) {
+        src0_cast_allocator.alloc(ctx.pool(), sizeof(float) * ggml_nelements(src0));
+        void* src0_cast_buf = src0_cast_allocator.get();
+
+        // contiguous F32 strides for the cast copy (element size must be float)
+        size_t cast_nb[GGML_MAX_DIMS];
+        cast_nb[0] = sizeof(float);
+        for (int i = 1; i < GGML_MAX_DIMS; i++) {
+            cast_nb[i] = cast_nb[i - 1] * src0->ne[i - 1];
+        }
+
+        aclTensor* acl_src0_f16 = ggml_cann_create_tensor(src0);
+        aclTensor* acl_cast = ggml_cann_create_tensor(src0_cast_buf,
+            ACL_FLOAT, sizeof(float), src0->ne, cast_nb, 4);
+        GGML_CANN_CALL_ACLNN_OP(ctx, Cast, acl_src0_f16, ACL_FLOAT, acl_cast);
+        ggml_cann_release_resources(ctx, acl_cast, acl_src0_f16);
+
+        src0_original = (char *) src0_cast_buf;
+        memcpy(ori_src0_nb, cast_nb, sizeof(ori_src0_nb));
+    }
+
+    std::vector<aclTensor*> src0_tensor_vec;
+    std::vector<aclTensor*> src1_tensor_vec;
+    std::vector<aclTensor*> dst_tensor_vec;
+    for (int64_t iid1 = 0; iid1 < ids->ne[1]; iid1++) {
+        for (int64_t id = 0; id < n_ids; id++) {
+            // src0_row [M, D] -> weight && permute
+            int64_t src0_ne[2] = {ne01, ne00};
+            size_t  src0_nb[2] = {ori_src0_nb[1], ori_src0_nb[0]};
+            // src1_row [D, 1] -> input
+            int64_t src1_ne[2] = {ne10, 1};
+            size_t  src1_nb[2] = {nb10, nb11};
+            // dst_row [M, 1] -> out
+            int64_t dst_ne[2] = {ne0, 1};
+            size_t  dst_nb[2] = {nb0, nb1};
+
+            // expert index
+            int32_t i02 = *(int32_t *) (ids_host.data() + iid1*ids->nb[1] + id*ids->nb[0]);
+            GGML_ASSERT(i02 >= 0 && i02 < n_as);
+
+            // If B = 1 (broadcast), always use 0; otherwise, use id.
+            int64_t i11 = (ne11 == 1 ? 0 : id);
+            int64_t i12 = iid1;
+
+            int64_t i1 = id;
+            int64_t i2 = i12;
+
+            void* src0_tmp_ptr = src0_original + i02*ori_src0_nb[2];
+            void* src1_tmp_ptr = src1_original + i11*nb11 + i12*nb12;
+            void* dst_tmp_ptr  = dst_original  + i1*nb1   + i2*nb2;
+
+            aclTensor* acl_src0 = ggml_cann_create_tensor(src0_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                src0_ne, src0_nb, 2);
+            aclTensor* acl_src1 = ggml_cann_create_tensor(src1_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                src1_ne, src1_nb, 2);
+            aclTensor* acl_dst = ggml_cann_create_tensor(dst_tmp_ptr,
+                ACL_FLOAT, sizeof(float),
+                dst_ne, dst_nb, 2);
+
+            src0_tensor_vec.push_back(acl_src0);
+            src1_tensor_vec.push_back(acl_src1);
+            dst_tensor_vec.push_back(acl_dst);
+        }
+    }
+
+    // GroupedMatmulV2 accepts at most 128 tensors per tensor list
+    const size_t GROUP_SIZE = 128;
+
+    // split and call GroupedMatmulV2
+    for (size_t i = 0; i < src0_tensor_vec.size(); i += GROUP_SIZE) {
+        size_t end = std::min(i + GROUP_SIZE, src0_tensor_vec.size());
+        std::vector<aclTensor*> src0_tensor_vec_split(src0_tensor_vec.begin() + i, src0_tensor_vec.begin() + end);
+        std::vector<aclTensor*> src1_tensor_vec_split(src1_tensor_vec.begin() + i, src1_tensor_vec.begin() + end);
+        std::vector<aclTensor*> dst_tensor_vec_split(dst_tensor_vec.begin() + i, dst_tensor_vec.begin() + end);
+
+        aclTensorList* src0_tensor_list = aclCreateTensorList(src0_tensor_vec_split.data(), src0_tensor_vec_split.size());
+        aclTensorList* src1_tensor_list = aclCreateTensorList(src1_tensor_vec_split.data(), src1_tensor_vec_split.size());
+        aclTensorList* dst_tensor_list = aclCreateTensorList(dst_tensor_vec_split.data(), dst_tensor_vec_split.size());
+
+        GGML_CANN_CALL_ACLNN_OP(ctx, GroupedMatmulV2, src1_tensor_list, src0_tensor_list,
+            nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, 0, -1, dst_tensor_list);
+
+        ggml_cann_release_resources(ctx, src0_tensor_list, src1_tensor_list, dst_tensor_list);
+    }
+}
+
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+    const enum ggml_type type = dst->src[0]->type;
+    switch (type) {
+        case GGML_TYPE_F32:
+        case GGML_TYPE_F16:
+            ggml_cann_mul_mat_id_fp(ctx, dst);
+            break;
+        default:
+            GGML_ABORT("Unsupported type for mul_mat_id");
+            break;
+    }
+}
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index 46235154..15993cce 100644
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -978,6 +978,33 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
     }
 }
 
+/**
+ * @brief Performs sparse expert-based matrix multiplication using the CANN backend.
+ *
+ * @details This function implements a MoE-style batched matrix multiplication, where each input token
+ * is routed to one or more experts, and each expert corresponds to a specific [D, M] weight matrix
+ * in the source tensor `src0`. The routing indices are provided via the `ids` tensor.
+ *
+ * For each token (from `src1`), the function selects the corresponding expert(s) as specified by `ids`,
+ * performs the matrix multiplication with the selected expert's weight submatrix (from `src0`),
+ * and stores the results in `dst`. This operation is optimized and executed on the CANN backend.
+ *
+ * Dimensions:
+ *  - src0: [D, M, A, 1], where A is the number of experts
+ *  - src1: [D, B, N, 1], where N is batch size and B is the slot count per sample
+ *  - ids : [K, N],       where K is the number of experts each token is routed to
+ *  - dst : [M, K, N, 1], output tensor storing the result of expert × token multiplication
+ *
+ * The current implementation issues one matmul per (token, slot) pair, regardless of `ne12`;
+ * only F32 and F16 expert weights (`src0`) are handled, any other type aborts.
+ * TODO: If `ne12 > 1`, use grouped multiplication and memory copying for efficiency.
+ *
+ * @param ctx The CANN context used for operations.
+ * @param dst The destination tensor where the expert-weighted token outputs are stored.
+ *            Expected to be of shape [M, K, N, 1].
+ */
+void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
+
 /**
  * @brief Applies a element-wise operation to two input tensors using the CANN
  * backend.
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index e2617b06..0cb7bbf1 100644
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1672,7 +1672,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
             ggml_cann_mul_mat(ctx, dst);
             break;
         case GGML_OP_MUL_MAT_ID:
-            return false;
+            ggml_cann_mul_mat_id(ctx, dst);
+            break;
         case GGML_OP_SCALE:
             ggml_cann_scale(ctx, dst);
             break;
@@ -2030,7 +2031,13 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
             }
         }
         case GGML_OP_MUL_MAT_ID:
-            return false;
+            switch (op->src[0]->type) {
+                case GGML_TYPE_F16:
+                case GGML_TYPE_F32:
+                    return true;
+                default:
+                    return false;
+            }
         // embedding
         case GGML_OP_GET_ROWS: {
             switch (op->src[0]->type) {