hip : Add hipGraph and VMM support to ROCM (llama/11362)

* Add hipGraph support * Enable VMM on rocm
2025-07-11 09:55:06 +02:00 · 2025-01-25 00:02:23 +01:00
parent 9e467815d4
commit 0282ad8fd1
5 changed files with 92 additions and 20 deletions
--- a/ggml/CMakeLists.txt
+++ b/ggml/CMakeLists.txt
@ -154,6 +154,7 @@ option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashA
 option(GGML_CUDA_GRAPHS                     "ggml: use CUDA graphs (llama.cpp only)"          ${GGML_CUDA_GRAPHS_DEFAULT})
 option(GGML_HIP                             "ggml: use HIP"                                   OFF)
 option(GGML_HIP_GRAPHS                      "ggml: use HIP graph, experimental, slow"         OFF)
 option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
 option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
 option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@ -588,7 +588,7 @@ struct ggml_tensor_extra_gpu {
 };
-#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
+#if ((CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)) || defined(GGML_HIP_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@ -62,7 +62,7 @@ static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size");
 [[noreturn]]
 void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
    int id = -1; // in case cudaGetDevice fails
-    cudaGetDevice(&id);
+    (void)cudaGetDevice(&id);
    GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
    GGML_LOG_ERROR("  current device: %d, in function %s at %s:%d\n", id, func, file, line);
@ -152,7 +152,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
    for (int id = 0; id < info.device_count; ++id) {
        int device_vmm = 0;
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
        CUdevice device;
        CU_CHECK(cuDeviceGet(&device, id));
        CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@ -164,7 +164,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
            alloc_prop.location.id = id;
            CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
        }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
        info.devices[id].vmm = !!device_vmm;
        cudaDeviceProp prop;
@ -300,7 +300,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
 };
 // pool with virtual memory
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
 struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@ -309,6 +309,9 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    size_t pool_used = 0;
    size_t pool_size = 0;
    size_t granularity;
 #if defined(GGML_USE_HIP)
    std::vector<std::pair<CUdeviceptr, size_t>> mappings;
 #endif
    explicit ggml_cuda_pool_vmm(int device) :
        device(device),
@ -317,7 +320,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
    ~ggml_cuda_pool_vmm() {
        if (pool_addr != 0) {
 #if defined(GGML_USE_HIP)
            // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
            for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
                CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
            }
 #else
            CU_CHECK(cuMemUnmap(pool_addr, pool_size));
 #endif
            CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
        }
    }
@ -350,7 +360,11 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
            }
            // map at the end of the pool
-            CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
+            CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
            CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
 #if defined(GGML_USE_HIP)
            mappings.push_back({start_ptr, reserve_size});
 #endif
            // the memory allocation handle is no longer needed after mapping
            CU_CHECK(cuMemRelease(handle));
@ -360,7 +374,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
            access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
            access.location.id = device;
            access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
-            CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
+            CU_CHECK(cuMemSetAccess((CUdeviceptr)((char *)(pool_addr) + pool_size), reserve_size, &access, 1));
            // add to the pool
            pool_size += reserve_size;
@ -372,7 +386,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
        GGML_ASSERT(pool_addr != 0);
-        void * ptr = (void *) (pool_addr + pool_used);
+        void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
        *actual_size = size;
        pool_used += size;
@ -391,17 +405,17 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
        pool_used -= size;
        // all deallocations must be in reverse order of the allocations
-        GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
+        GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
    }
 };
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
 std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
    if (ggml_cuda_info().devices[device].vmm) {
        return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
    }
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
    return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
 }
@ -547,7 +561,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_bac
    cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
    if (err != cudaSuccess) {
        // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
        GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
        return nullptr;
    }
@ -962,7 +976,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
    cudaError_t err = cudaMallocHost((void **) &ptr, size);
    if (err != cudaSuccess) {
        // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
        GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
        return nullptr;
@ -1209,7 +1223,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                        CUDA_CHECK(err);
                    } else {
                        // reset the error
-                        cudaGetLastError();
+                        (void)cudaGetLastError();
                    }
                } else {
                    cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
@ -1217,7 +1231,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
                        CUDA_CHECK(err);
                    } else {
                        // reset the error
-                        cudaGetLastError();
+                        (void)cudaGetLastError();
                    }
                }
            }
@ -2452,7 +2466,7 @@ static void maintain_cuda_graph(ggml_backend_cuda_context * cuda_ctx, std::vecto
                    if (stat == cudaErrorInvalidDeviceFunction) {
                        // Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
                        // We don't need to update blas nodes, so clear error and move on.
-                        cudaGetLastError();
+                        (void)cudaGetLastError();
                    } else {
                        GGML_ASSERT(stat == cudaSuccess);
                    }
@ -2507,14 +2521,20 @@ static bool is_cuda_graph_update_required(ggml_backend_cuda_context * cuda_ctx,
 static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
    cudaGraphExecUpdateResultInfo result_info;
 #ifdef __HIP_PLATFORM_AMD__
    hipGraphNode_t errorNode;
    hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
 #else
    cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
 #endif
    if (stat == cudaErrorGraphExecUpdateFailure) {
 #ifndef NDEBUG
        GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
 #endif
        // The pre-existing graph exec cannot be updated due to violated constraints
        // so instead clear error and re-instantiate
-        cudaGetLastError();
+        (void)cudaGetLastError();
        CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
        cuda_ctx->cuda_graph->instance = nullptr;
        CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
@ -2742,7 +2762,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
    cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
    if (err != cudaSuccess) {
        // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
        GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                           size / 1024.0 / 1024.0, cudaGetErrorString(err));
@ -2762,7 +2782,7 @@ void ggml_backend_cuda_unregister_host_buffer(void * buffer) {
    cudaError_t err = cudaHostUnregister(buffer);
    if (err != cudaSuccess) {
        // clear the error
-        cudaGetLastError();
+        (void)cudaGetLastError();
    }
 }
--- a/ggml/src/ggml-cuda/vendors/hip.h
+++ b/ggml/src/ggml-cuda/vendors/hip.h
@ -19,6 +19,12 @@
 #define CUBLAS_TF32_TENSOR_OP_MATH 0
 #define CUDA_R_16F  HIPBLAS_R_16F
 #define CUDA_R_32F  HIPBLAS_R_32F
 #define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
 #define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
 #define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
 #define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
 #define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
 #define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
 #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
 #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6
 #define cublasCreate hipblasCreate
@ -74,6 +80,21 @@
 #define cudaMemGetInfo hipMemGetInfo
 #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
 #define cudaSetDevice hipSetDevice
 #define cuDeviceGet hipDeviceGet
 #define CUdevice hipDevice_t
 #define CUdeviceptr hipDeviceptr_t
 #define cuMemUnmap hipMemUnmap
 #define CUmemAccessDesc hipMemAccessDesc
 #define cuMemAddressFree hipMemAddressFree
 #define cuMemRelease hipMemRelease
 #define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
 #define cuMemCreate hipMemCreate
 #define cuMemAddressReserve hipMemAddressReserve
 #define cuMemMap hipMemMap
 #define cuMemSetAccess hipMemSetAccess
 #define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
 #define CUmemAllocationProp hipMemAllocationProp
 #define cuDeviceGetAttribute hipDeviceGetAttribute
 #define cudaStreamCreateWithFlags hipStreamCreateWithFlags
 #define cudaStreamDestroy hipStreamDestroy
 #define cudaStreamFireAndForget hipStreamFireAndForget
@ -81,6 +102,28 @@
 #define cudaStreamPerThread hipStreamPerThread
 #define cudaStreamSynchronize hipStreamSynchronize
 #define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
 #define cudaGraphExec_t hipGraphExec_t
 #define cudaGraphNode_t hipGraphNode_t
 #define cudaKernelNodeParams hipKernelNodeParams
 #define cudaKernelNodeParams hipKernelNodeParams
 #define cudaGraphExecDestroy hipGraphExecDestroy
 #define cudaGraphLaunch hipGraphLaunch
 #define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
 #define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
 #define cudaGraphNodeType hipGraphNodeType
 #define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
 #define cudaGraphInstantiate hipGraphInstantiate
 #define cudaStreamEndCapture hipStreamEndCapture
 #define cudaGraphDestroy hipGraphDestroy
 #define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
 #define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
 #define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
 #define cudaGraphNodeGetType hipGraphNodeGetType
 #define cudaGraphGetNodes hipGraphGetNodes
 #define cudaGraphExecUpdate hipGraphExecUpdate
 #define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
 #define cudaStreamBeginCapture hipStreamBeginCapture
 #define cudaGraph_t hipGraph_t
 #define cudaStream_t hipStream_t
 #define cudaSuccess hipSuccess
 #define __trap() do { abort(); __builtin_unreachable(); } while(0)
--- a/ggml/src/ggml-hip/CMakeLists.txt
+++ b/ggml/src/ggml-hip/CMakeLists.txt
@ -92,6 +92,14 @@ if (GGML_CUDA_NO_PEER_COPY)
    add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
 endif()
 if (GGML_HIP_GRAPHS)
    add_compile_definitions(GGML_HIP_GRAPHS)
 endif()
 if (GGML_CUDA_NO_VMM)
    add_compile_definitions(GGML_CUDA_NO_VMM)
 endif()
 if (CXX_IS_HIPCC)
    set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
    target_link_libraries(ggml-hip PRIVATE hip::device)