musa: Upgrade MUSA SDK version to rc4.0.1 and use mudnn::Unary::IDENTITY op to accelerate D2D memory copy (llama/13647)

* musa: fix build warning (unused parameter)

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* musa: upgrade MUSA SDK version to rc4.0.1

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* musa: use mudnn::Unary::IDENTITY op to accelerate D2D memory copy

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

* Update ggml/src/ggml-cuda/cpy.cu

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* musa: remove MUDNN_CHECK_GEN and use CUDA_CHECK_GEN instead in MUDNN_CHECK

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>

---------

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
R0CKSTAR 2025-05-21 09:58:49 +08:00 committed by Georgi Gerganov
parent 4712f7b663
commit 131ee546ca
5 changed files with 144 additions and 4 deletions

View File

@ -1,5 +1,8 @@
#include "cpy.cuh"
#include "dequantize.cuh"
#ifdef GGML_USE_MUSA
#include "ggml-musa/mudnn.cuh"
#endif // GGML_USE_MUSA
typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
@ -597,7 +600,14 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
#endif
if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
#ifdef GGML_USE_MUSA
if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
} else
#endif // GGML_USE_MUSA
{
CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
}
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream, dest_ptrs_d, graph_cpynode_index);
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_BF16) {

View File

@ -772,7 +772,7 @@ static __device__ __forceinline__ void flash_attn_ext_f16_iter(
GGML_UNUSED(stride_mask); GGML_UNUSED(jt); GGML_UNUSED(tile_K);
GGML_UNUSED(tile_V); GGML_UNUSED(tile_mask); GGML_UNUSED(Q_B);
GGML_UNUSED(VKQ_C); GGML_UNUSED(KQ_max); GGML_UNUSED(KQ_rowsum);
GGML_UNUSED(kb0);
GGML_UNUSED(kb0); GGML_UNUSED(tile_Q);
NO_DEVICE_CODE;
#endif // NEW_MMA_AVAILABLE
}

View File

@ -27,12 +27,15 @@ if (MUSAToolkit_FOUND)
file(GLOB GGML_HEADERS_MUSA "../ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_MUSA "../../include/ggml-cuda.h")
list(APPEND GGML_HEADERS_MUSA "../ggml-musa/mudnn.cuh")
file(GLOB GGML_SOURCES_MUSA "../ggml-cuda/*.cu")
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
list(APPEND GGML_SOURCES_MUSA ${SRCS})
file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_MUSA ${SRCS})
file(GLOB SRCS "../ggml-musa/*.cu")
list(APPEND GGML_SOURCES_MUSA ${SRCS})
if (GGML_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@ -62,7 +65,9 @@ if (MUSAToolkit_FOUND)
)
# TODO: do not use CUDA definitions for MUSA
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
if (NOT GGML_BACKEND_DL)
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
endif()
add_compile_definitions(GGML_USE_MUSA)
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
@ -92,9 +97,10 @@ if (MUSAToolkit_FOUND)
endif()
if (GGML_STATIC)
# TODO: mudnn has not provided static libraries yet
target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
else()
target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
endif()
if (GGML_CUDA_NO_VMM)

112
ggml/src/ggml-musa/mudnn.cu Normal file
View File

@ -0,0 +1,112 @@
#include <mutex>
#include <mudnn.h>
#include "mudnn.cuh"
namespace mudnn = musa::dnn;
// Returns a human-readable error string for mudnn::Status
const char* mudnnGetErrorString(mudnn::Status err) {
switch (err) {
case mudnn::Status::SUCCESS:
return "Success";
case mudnn::Status::INVALID_PARAMETER:
return "Invalid parameter";
case mudnn::Status::NOT_INITIALIZED:
return "Not initialized";
case mudnn::Status::ALLOC_FAILED:
return "Allocation failed";
case mudnn::Status::NOT_SUPPORTED:
return "Not supported";
case mudnn::Status::INTERNAL_ERROR:
return "Internal error";
case mudnn::Status::ARCH_MISMATCH:
return "Architecture mismatch";
case mudnn::Status::EXECUTION_FAILED:
return "Execution failed";
default:
return "Unknown mudnn status";
}
}
// Error checking macro for MUDNN calls
#define MUDNN_CHECK(err) CUDA_CHECK_GEN(err, mudnn::Status::SUCCESS, mudnnGetErrorString)
namespace {
// Thread-safe cache for mudnn::Handle objects per device
std::unordered_map<int, std::unique_ptr<mudnn::Handle>> handle_cache;
std::mutex handle_cache_mutex;
mudnn::Handle* get_cached_handle(int device_id) {
std::lock_guard<std::mutex> lock(handle_cache_mutex);
auto it = handle_cache.find(device_id);
if (it != handle_cache.end()) {
return it->second.get();
}
auto handle = std::make_unique<mudnn::Handle>(device_id);
mudnn::Handle* handle_ptr = handle.get();
handle_cache[device_id] = std::move(handle);
return handle_ptr;
}
}
// Extracts dimensions and strides from a ggml_tensor
int get_ggml_dims_and_strides(const ggml_tensor* tensor,
std::vector<int64_t>& dims,
std::vector<int64_t>& strides) {
const int ndims = ggml_n_dims(tensor);
const size_t element_size = ggml_element_size(tensor);
dims.resize(ndims);
strides.resize(ndims);
for (int i = 0; i < ndims; ++i) {
dims[i] = tensor->ne[i];
strides[i] = tensor->nb[i] / static_cast<int64_t>(element_size);
}
return ndims;
}
// Converts ggml_type to mudnn::Tensor::Type
mudnn::Tensor::Type ggml_type_to_mudnn_type(ggml_type type) {
switch (type) {
case GGML_TYPE_F32:
return mudnn::Tensor::Type::FLOAT;
case GGML_TYPE_F16:
return mudnn::Tensor::Type::HALF;
// TODO: Add support for other types
default:
MUDNN_CHECK(mudnn::Status::NOT_SUPPORTED);
}
return mudnn::Tensor::Type::FLOAT; // Default fallback
}
// Asynchronous memory copy using mudnn::Unary::IDENTITY
musaError_t mudnnMemcpyAsync(ggml_backend_cuda_context& ctx, const ggml_tensor* dst, const ggml_tensor* src) {
mudnn::Tensor tensor_dst, tensor_src;
MUDNN_CHECK(tensor_dst.SetType(ggml_type_to_mudnn_type(dst->type)));
MUDNN_CHECK(tensor_src.SetType(ggml_type_to_mudnn_type(src->type)));
std::vector<int64_t> dims, strides;
const int ndims = get_ggml_dims_and_strides(src, dims, strides);
MUDNN_CHECK(tensor_dst.SetNdInfo(ndims, dims.data(), strides.data()));
MUDNN_CHECK(tensor_src.SetNdInfo(ndims, dims.data(), strides.data()));
MUDNN_CHECK(tensor_dst.SetAddr(dst->data));
MUDNN_CHECK(tensor_src.SetAddr(src->data));
mudnn::Unary op;
MUDNN_CHECK(op.SetMode(mudnn::Unary::Mode::IDENTITY));
MUDNN_CHECK(op.SetAlpha(0.0f));
MUDNN_CHECK(op.SetBeta(0.0f));
mudnn::Handle* handle = get_cached_handle(ctx.device);
MUDNN_CHECK(handle->SetStream(ctx.stream()));
MUDNN_CHECK(op.Run(*handle, tensor_dst, tensor_src));
return musaSuccess;
}

View File

@ -0,0 +1,12 @@
#pragma once
#include "../include/ggml.h"
#include "../ggml-cuda/common.cuh"
// Asynchronously copies data from src tensor to dst tensor using the provided context.
// Returns a musaError_t indicating success or failure.
musaError_t mudnnMemcpyAsync(
ggml_backend_cuda_context &ctx,
const ggml_tensor *dst,
const ggml_tensor *src
);