Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-07-01 23:10:47 +02:00)
Compare commits: gg/ci-cuda...gg/cuda-no (4 commits)
Author | SHA1 | Date
---|---|---
 | 267e15a46d |
 | 420b6abc54 |
 | 99804b0f3e |
 | c55964c956 |
Makefile (2 changed lines)

@@ -297,10 +297,10 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif

 whisper-mel-cuda.o: whisper-mel-cuda.cu whisper.h ggml.h ggml-backend.h whisper-mel.hpp whisper-mel-cuda.hpp
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif

 ifdef WHISPER_HIPBLAS
 ROCM_PATH ?= /opt/rocm
@@ -75,7 +75,7 @@ static __global__ void mul_mat_vec_q(
         tmp[j][i] = warp_reduce_sum(tmp[j][i]);
     }

-    if (threadIdx.x < rows_per_cuda_block) {
+    if (threadIdx.x < rows_per_cuda_block && (rows_per_cuda_block == 1 || row0 + threadIdx.x < nrows_dst)) {
         dst[j*nrows_dst + row0 + threadIdx.x] = tmp[j][threadIdx.x];
     }
 }
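The added condition is the usual tail guard for tiled row writes: each block owns `rows_per_cuda_block` consecutive destination rows starting at `row0`, and when the number of rows is not a multiple of the tile size the last block must not write past `nrows_dst` (the `rows_per_cuda_block == 1` shortcut presumably lets the compiler drop the comparison when the grid is sized exactly). A minimal, self-contained sketch of the same pattern — not the ggml kernel itself; all names and sizes here are illustrative:

```cuda
#include <cstdio>
#include <cuda_runtime.h>

// Each block writes `rows_per_block` consecutive rows starting at row0.
template <int rows_per_block>
__global__ void write_rows(float * dst, const int nrows_dst) {
    const int row0 = blockIdx.x * rows_per_block;
    // Without the second condition, the last block writes out of bounds
    // whenever nrows_dst % rows_per_block != 0.
    if (threadIdx.x < rows_per_block && (rows_per_block == 1 || row0 + threadIdx.x < nrows_dst)) {
        dst[row0 + threadIdx.x] = float(row0 + threadIdx.x);
    }
}

int main() {
    const int nrows_dst = 10;             // deliberately not a multiple of the tile size
    constexpr int rows_per_block = 4;
    const int nblocks = (nrows_dst + rows_per_block - 1) / rows_per_block;

    float * dst = nullptr;
    cudaMalloc(&dst, nrows_dst * sizeof(float));
    write_rows<rows_per_block><<<nblocks, 32>>>(dst, nrows_dst);
    cudaDeviceSynchronize();

    float host[nrows_dst];
    cudaMemcpy(host, dst, sizeof(host), cudaMemcpyDeviceToHost);
    for (int i = 0; i < nrows_dst; ++i) {
        printf("%g ", host[i]);
    }
    printf("\n");
    cudaFree(dst);
    return 0;
}
```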
@@ -203,14 +203,14 @@ public:
        // create Hann window
        {
            auto hw = whisper_mel_calc::hann_window();
-           CUDA_CHECK(cudaMallocAsync(&m_hann_window, hw.len * sizeof(float), m_stream));
+           CUDA_CHECK(cudaMalloc(&m_hann_window, hw.len * sizeof(float)));
            CUDA_CHECK(cudaMemcpyAsync(m_hann_window, hw.data, hw.len * sizeof(float), cudaMemcpyHostToDevice, m_stream));
        }

        // fill filters
        {
            auto& f = filters.data;
-           CUDA_CHECK(cudaMallocAsync(&m_filters, f.size() * sizeof(float), m_stream));
+           CUDA_CHECK(cudaMalloc(&m_filters, f.size() * sizeof(float)));
            CUDA_CHECK(cudaMemcpyAsync(m_filters, f.data(), f.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
        }
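The same substitution runs through all of the mel-calculation hunks below: the stream-ordered allocator (cudaMallocAsync, and cudaFreeAsync in the cleanup hunk further down) is replaced with plain cudaMalloc/cudaFree, while the host-to-device copies stay asynchronous on m_stream. The commit messages are not shown in this view, but cudaMallocAsync requires CUDA 11.2+ and device/driver support for memory pools, whereas cudaMalloc works everywhere. A small sketch of the resulting pattern, with illustrative names (`buf`, `stream`, `n`) and a CUDA_CHECK macro of my own:

```cuda
#include <cstdio>
#include <cstdlib>
#include <vector>
#include <cuda_runtime.h>

#define CUDA_CHECK(call) do { \
    cudaError_t err_ = (call); \
    if (err_ != cudaSuccess) { \
        std::fprintf(stderr, "CUDA error %s at %s:%d\n", cudaGetErrorString(err_), __FILE__, __LINE__); \
        std::exit(1); \
    } \
} while (0)

int main() {
    const size_t n = 1024;
    std::vector<float> host(n, 1.0f);

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    float * buf = nullptr;
    // Synchronous allocation: works on every CUDA device. cudaMallocAsync would
    // additionally require CUDA >= 11.2 and memory-pool support in the driver.
    CUDA_CHECK(cudaMalloc(&buf, n * sizeof(float)));
    // The upload itself can still be queued on a stream, as in the diff above.
    CUDA_CHECK(cudaMemcpyAsync(buf, host.data(), n * sizeof(float), cudaMemcpyHostToDevice, stream));

    CUDA_CHECK(cudaStreamSynchronize(stream));  // ensure the copy has finished
    CUDA_CHECK(cudaFree(buf));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}
```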
@@ -243,7 +243,7 @@ public:
            m_cufft_workspace = nullptr;
        }
        CUFFT_CHECK(cufftEstimate1d(WHISPER_N_FFT, CUFFT_R2C, max_frames, &m_cufft_workspace_size));
-       CUDA_CHECK(cudaMallocAsync(&m_cufft_workspace, m_cufft_workspace_size, m_stream));
+       CUDA_CHECK(cudaMalloc(&m_cufft_workspace, m_cufft_workspace_size));
    }

    // device reduce working area
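This hunk sizes the cuFFT work area once, up front, with cufftEstimate1d and now allocates it with cudaMalloc. A focused sketch of that sizing step under assumed constants (400 standing in for WHISPER_N_FFT, 3001 for the maximum batch; both illustrative):

```cuda
#include <cstdio>
#include <cufft.h>
#include <cuda_runtime.h>

int main() {
    const int n_fft      = 400;   // stand-in for WHISPER_N_FFT
    const int max_frames = 3001;  // stand-in for the maximum number of frames

    size_t work_size = 0;
    // cufftEstimate1d returns an estimated upper bound on the work area needed by a
    // 1-D plan of this size and batch, so a single allocation can be reused for the
    // lifetime of the object.
    if (cufftEstimate1d(n_fft, CUFFT_R2C, max_frames, &work_size) != CUFFT_SUCCESS) {
        std::fprintf(stderr, "cufftEstimate1d failed\n");
        return 1;
    }

    void * workspace = nullptr;
    if (cudaMalloc(&workspace, work_size) != cudaSuccess) {
        std::fprintf(stderr, "cudaMalloc failed\n");
        return 1;
    }
    std::printf("estimated cuFFT work area: %zu bytes\n", work_size);

    cudaFree(workspace);
    return 0;
}
```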
@@ -261,7 +261,7 @@ public:
        cub::DeviceReduce::Max(nullptr, nbytes, temp, temp, max_frames * max_mels);
        m_log_mel_temp_storage_size = nbytes + LOG_MEL_PREFIX_SIZE;

-       CUDA_CHECK(cudaMallocAsync(&m_log_mel_temp_storage, m_log_mel_temp_storage_size, m_stream));
+       CUDA_CHECK(cudaMalloc(&m_log_mel_temp_storage, m_log_mel_temp_storage_size));
    }

    m_n_max_samples = n_samples;
@@ -286,16 +286,16 @@ public:
        const auto n_frames = 1 + (padded_samples.size() - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;

        float * cu_padded_samples = nullptr;
-       CUDA_CHECK(cudaMallocAsync(&cu_padded_samples, padded_samples.size() * sizeof(float), m_stream));
+       CUDA_CHECK(cudaMalloc(&cu_padded_samples, padded_samples.size() * sizeof(float)));
        CUDA_CHECK(cudaMemcpyAsync(cu_padded_samples, padded_samples.data(), padded_samples.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));

        float * stft_in = nullptr; // contiguous buffer for stft input
-       CUDA_CHECK(cudaMallocAsync(&stft_in, n_frames * WHISPER_N_FFT * sizeof(float), m_stream));
+       CUDA_CHECK(cudaMalloc(&stft_in, n_frames * WHISPER_N_FFT * sizeof(float)));

        fill_stft_input(cu_padded_samples, int(n_frames), m_hann_window, stft_in, m_stream);

        cufftComplex* stft_out;
-       CUDA_CHECK(cudaMallocAsync(&stft_out, n_frames * WHISPER_N_FFT_HALF * sizeof(cufftComplex), m_stream));
+       CUDA_CHECK(cudaMalloc(&stft_out, n_frames * WHISPER_N_FFT_HALF * sizeof(cufftComplex)));

        cufftHandle plan;
        CUFFT_CHECK(cufftCreate(&plan));
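For the STFT itself, the hunk allocates one contiguous input buffer of n_frames × WHISPER_N_FFT reals and an output buffer of n_frames × WHISPER_N_FFT_HALF complex bins, then creates a cuFFT plan; the plan configuration is cut off in this view. A typical way to run such a batched real-to-complex transform with a caller-managed work area — an assumption for illustration, not necessarily how the code above configures its plan — looks like this:

```cuda
#include <cstdio>
#include <vector>
#include <cufft.h>
#include <cuda_runtime.h>

int main() {
    const int n_fft    = 400;            // frame length (stand-in for WHISPER_N_FFT)
    const int n_half   = n_fft / 2 + 1;  // complex bins per frame for an R2C transform
    const int n_frames = 8;              // batch size

    std::vector<float> frames(size_t(n_frames) * n_fft, 0.5f);

    float * d_in = nullptr;
    cufftComplex * d_out = nullptr;
    cudaMalloc(&d_in,  frames.size() * sizeof(float));
    cudaMalloc(&d_out, size_t(n_frames) * n_half * sizeof(cufftComplex));
    cudaMemcpy(d_in, frames.data(), frames.size() * sizeof(float), cudaMemcpyHostToDevice);

    cufftHandle plan;
    cufftCreate(&plan);
    cufftSetAutoAllocation(plan, 0);      // supply the work area ourselves

    size_t work_size = 0;
    cufftMakePlan1d(plan, n_fft, CUFFT_R2C, n_frames, &work_size);

    void * work_area = nullptr;
    cudaMalloc(&work_area, work_size);
    cufftSetWorkArea(plan, work_area);

    cufftExecR2C(plan, d_in, d_out);      // one transform per frame, in a single call
    cudaDeviceSynchronize();

    cufftComplex first;
    cudaMemcpy(&first, d_out, sizeof(first), cudaMemcpyDeviceToHost);
    std::printf("bin 0 of frame 0: %g + %gi\n", first.x, first.y);

    cufftDestroy(plan);
    cudaFree(work_area);
    cudaFree(d_out);
    cudaFree(d_in);
    return 0;
}
```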
@@ -311,11 +311,11 @@ public:

        const auto n_mag_frames = n_frames - 1; // drop last frame
        float * magnitudes;
-       CUDA_CHECK(cudaMallocAsync(&magnitudes, n_mag_frames * WHISPER_N_FFT_HALF * sizeof(float), m_stream));
+       CUDA_CHECK(cudaMalloc(&magnitudes, n_mag_frames * WHISPER_N_FFT_HALF * sizeof(float)));
        calc_magnitudes(stft_out, int(n_mag_frames), magnitudes, m_stream);

        float * mel_data = nullptr;
-       CUDA_CHECK(cudaMallocAsync(&mel_data, m_n_mel * n_mag_frames * sizeof(float), m_stream));
+       CUDA_CHECK(cudaMalloc(&mel_data, m_n_mel * n_mag_frames * sizeof(float)));

        const float fone = 1.0f, fzero = 0.0f;
        CUBLAS_CHECK(cublasSgemm(m_cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
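The mel projection that follows is a single SGEMM with one transposed operand (CUBLAS_OP_T, CUBLAS_OP_N): conceptually, the filter bank transposed times the magnitude matrix. The exact operand order and leading dimensions are cut off in this view, so here is a standalone sketch of that shape of call with tiny, illustrative dimensions (n_mel = 2, n_frames = 3, n_fft_half = 4); cuBLAS is column-major.

```cuda
#include <cstdio>
#include <cublas_v2.h>
#include <cuda_runtime.h>

int main() {
    const int n_mel = 2, n_frames = 3, n_half = 4;

    // F: n_half x n_mel, column-major (each column is one mel filter)
    const float F[n_half * n_mel] = { 1, 0, 0, 0,    // filter 0
                                      0, 1, 1, 0 };  // filter 1
    // M: n_half x n_frames, column-major (each column is one frame's magnitudes)
    const float M[n_half * n_frames] = { 1, 2, 3, 4,  5, 6, 7, 8,  9, 10, 11, 12 };

    float *dF, *dM, *dC;
    cudaMalloc(&dF, sizeof(F));
    cudaMalloc(&dM, sizeof(M));
    cudaMalloc(&dC, n_mel * n_frames * sizeof(float));
    cudaMemcpy(dF, F, sizeof(F), cudaMemcpyHostToDevice);
    cudaMemcpy(dM, M, sizeof(M), cudaMemcpyHostToDevice);

    cublasHandle_t handle;
    cublasCreate(&handle);

    const float fone = 1.0f, fzero = 0.0f;
    // C (n_mel x n_frames) = F^T (n_mel x n_half) * M (n_half x n_frames)
    cublasSgemm(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                n_mel, n_frames, n_half,
                &fone,  dF, n_half,
                        dM, n_half,
                &fzero, dC, n_mel);

    float C[n_mel * n_frames];
    cudaMemcpy(C, dC, sizeof(C), cudaMemcpyDeviceToHost);
    // expected, column by column: (1, 5) (5, 13) (9, 21)
    for (int j = 0; j < n_frames; ++j) {
        std::printf("frame %d: %g %g\n", j, C[j * n_mel + 0], C[j * n_mel + 1]);
    }

    cublasDestroy(handle);
    cudaFree(dC); cudaFree(dM); cudaFree(dF);
    return 0;
}
```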
@@ -343,11 +343,11 @@ public:

        // cleanup
        CUFFT_CHECK(cufftDestroy(plan));
-       CUDA_CHECK(cudaFreeAsync(mel_data, m_stream));
-       CUDA_CHECK(cudaFreeAsync(magnitudes, m_stream));
-       CUDA_CHECK(cudaFreeAsync(stft_out, m_stream));
-       CUDA_CHECK(cudaFreeAsync(stft_in, m_stream));
-       CUDA_CHECK(cudaFreeAsync(cu_padded_samples, m_stream));
+       CUDA_CHECK(cudaFree(mel_data));
+       CUDA_CHECK(cudaFree(magnitudes));
+       CUDA_CHECK(cudaFree(stft_out));
+       CUDA_CHECK(cudaFree(stft_in));
+       CUDA_CHECK(cudaFree(cu_padded_samples));

        return ret;
    }
@@ -3167,7 +3167,7 @@ struct mel_calc_cpu : public whisper_mel_calc {
}

whisper_mel_calc * whisper_mel_calc_create(ggml_backend_t backend, const whisper_filters & filters) {
-#if GGML_USE_CUDA
+#if defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
    if (ggml_backend_is_cuda(backend)) {
        auto ret = whisper_mel_calc_create_cuda(backend, filters);
        // run a warmup to avoid the first kernel launch overhead (thus we get the best perf even on the first run)
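The last hunk tightens the compile-time guard around the CUDA mel path. hipBLAS builds of the ggml CUDA backend also define GGML_USE_CUDA (they reuse the CUDA sources), so checking that macro alone would pull in whisper_mel_calc_create_cuda on HIP as well, where the cuFFT-based source is not compiled (the Makefile hunk above only builds whisper-mel-cuda.o in the CUDA branch, before `ifdef WHISPER_HIPBLAS`). A tiny self-contained illustration of the difference between the two guards; the macro definitions below are set locally for the demo, not taken from the real build:

```cpp
#include <cstdio>

#define GGML_USE_CUDA 1
#define GGML_USE_HIPBLAS 1   // pretend this translation unit is part of a hipBLAS build

int main() {
#if GGML_USE_CUDA
    std::puts("old guard: would take the cuFFT-based mel path (wrong for a hipBLAS build)");
#endif
#if defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
    std::puts("new guard: cuFFT-based mel path");
#else
    std::puts("new guard: fall back to the CPU mel implementation");
#endif
    return 0;
}
```

With the old guard both branches fire in a simulated hipBLAS build; with the new one the HIP build falls through, presumably to the mel_calc_cpu implementation referenced in the hunk header.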