mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-07-04 16:30:58 +02:00
Compare commits
2 Commits
gg/cuda-no
...
gg/cuda-fi
Author | SHA1 | Date | |
---|---|---|---|
13c5446759 | |||
9df6298a91 |
2
.github/workflows/build.yml
vendored
2
.github/workflows/build.yml
vendored
@ -459,7 +459,7 @@ jobs:
|
|||||||
path: build/bin/${{ matrix.build }}
|
path: build/bin/${{ matrix.build }}
|
||||||
|
|
||||||
windows-cublas:
|
windows-cublas:
|
||||||
runs-on: windows-2019
|
runs-on: windows-latest
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
|
2
Makefile
2
Makefile
@ -297,10 +297,10 @@ ggml-cuda/%.o: ggml-cuda/%.cu ggml-cuda/%.cuh ggml.h ggml-common.h ggml-cuda/com
|
|||||||
|
|
||||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h ggml.h ggml-backend.h ggml-backend-impl.h ggml-common.h $(wildcard ggml-cuda/*.cuh)
|
||||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||||
|
endif
|
||||||
|
|
||||||
whisper-mel-cuda.o: whisper-mel-cuda.cu whisper.h ggml.h ggml-backend.h whisper-mel.hpp whisper-mel-cuda.hpp
|
whisper-mel-cuda.o: whisper-mel-cuda.cu whisper.h ggml.h ggml-backend.h whisper-mel.hpp whisper-mel-cuda.hpp
|
||||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||||
endif
|
|
||||||
|
|
||||||
ifdef WHISPER_HIPBLAS
|
ifdef WHISPER_HIPBLAS
|
||||||
ROCM_PATH ?= /opt/rocm
|
ROCM_PATH ?= /opt/rocm
|
||||||
|
@ -203,14 +203,14 @@ public:
|
|||||||
// create Hann window
|
// create Hann window
|
||||||
{
|
{
|
||||||
auto hw = whisper_mel_calc::hann_window();
|
auto hw = whisper_mel_calc::hann_window();
|
||||||
CUDA_CHECK(cudaMalloc(&m_hann_window, hw.len * sizeof(float)));
|
CUDA_CHECK(cudaMallocAsync(&m_hann_window, hw.len * sizeof(float), m_stream));
|
||||||
CUDA_CHECK(cudaMemcpyAsync(m_hann_window, hw.data, hw.len * sizeof(float), cudaMemcpyHostToDevice, m_stream));
|
CUDA_CHECK(cudaMemcpyAsync(m_hann_window, hw.data, hw.len * sizeof(float), cudaMemcpyHostToDevice, m_stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
// fill filters
|
// fill filters
|
||||||
{
|
{
|
||||||
auto& f = filters.data;
|
auto& f = filters.data;
|
||||||
CUDA_CHECK(cudaMalloc(&m_filters, f.size() * sizeof(float)));
|
CUDA_CHECK(cudaMallocAsync(&m_filters, f.size() * sizeof(float), m_stream));
|
||||||
CUDA_CHECK(cudaMemcpyAsync(m_filters, f.data(), f.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
|
CUDA_CHECK(cudaMemcpyAsync(m_filters, f.data(), f.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -243,7 +243,7 @@ public:
|
|||||||
m_cufft_workspace = nullptr;
|
m_cufft_workspace = nullptr;
|
||||||
}
|
}
|
||||||
CUFFT_CHECK(cufftEstimate1d(WHISPER_N_FFT, CUFFT_R2C, max_frames, &m_cufft_workspace_size));
|
CUFFT_CHECK(cufftEstimate1d(WHISPER_N_FFT, CUFFT_R2C, max_frames, &m_cufft_workspace_size));
|
||||||
CUDA_CHECK(cudaMalloc(&m_cufft_workspace, m_cufft_workspace_size));
|
CUDA_CHECK(cudaMallocAsync(&m_cufft_workspace, m_cufft_workspace_size, m_stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
// device reduce working area
|
// device reduce working area
|
||||||
@ -261,7 +261,7 @@ public:
|
|||||||
cub::DeviceReduce::Max(nullptr, nbytes, temp, temp, max_frames * max_mels);
|
cub::DeviceReduce::Max(nullptr, nbytes, temp, temp, max_frames * max_mels);
|
||||||
m_log_mel_temp_storage_size = nbytes + LOG_MEL_PREFIX_SIZE;
|
m_log_mel_temp_storage_size = nbytes + LOG_MEL_PREFIX_SIZE;
|
||||||
|
|
||||||
CUDA_CHECK(cudaMalloc(&m_log_mel_temp_storage, m_log_mel_temp_storage_size));
|
CUDA_CHECK(cudaMallocAsync(&m_log_mel_temp_storage, m_log_mel_temp_storage_size, m_stream));
|
||||||
}
|
}
|
||||||
|
|
||||||
m_n_max_samples = n_samples;
|
m_n_max_samples = n_samples;
|
||||||
@ -286,16 +286,16 @@ public:
|
|||||||
const auto n_frames = 1 + (padded_samples.size() - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
|
const auto n_frames = 1 + (padded_samples.size() - WHISPER_N_FFT) / WHISPER_HOP_LENGTH;
|
||||||
|
|
||||||
float * cu_padded_samples = nullptr;
|
float * cu_padded_samples = nullptr;
|
||||||
CUDA_CHECK(cudaMalloc(&cu_padded_samples, padded_samples.size() * sizeof(float)));
|
CUDA_CHECK(cudaMallocAsync(&cu_padded_samples, padded_samples.size() * sizeof(float), m_stream));
|
||||||
CUDA_CHECK(cudaMemcpyAsync(cu_padded_samples, padded_samples.data(), padded_samples.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
|
CUDA_CHECK(cudaMemcpyAsync(cu_padded_samples, padded_samples.data(), padded_samples.size() * sizeof(float), cudaMemcpyHostToDevice, m_stream));
|
||||||
|
|
||||||
float * stft_in = nullptr; // contiguous buffer for stft input
|
float * stft_in = nullptr; // contiguous buffer for stft input
|
||||||
CUDA_CHECK(cudaMalloc(&stft_in, n_frames * WHISPER_N_FFT * sizeof(float)));
|
CUDA_CHECK(cudaMallocAsync(&stft_in, n_frames * WHISPER_N_FFT * sizeof(float), m_stream));
|
||||||
|
|
||||||
fill_stft_input(cu_padded_samples, int(n_frames), m_hann_window, stft_in, m_stream);
|
fill_stft_input(cu_padded_samples, int(n_frames), m_hann_window, stft_in, m_stream);
|
||||||
|
|
||||||
cufftComplex* stft_out;
|
cufftComplex* stft_out;
|
||||||
CUDA_CHECK(cudaMalloc(&stft_out, n_frames * WHISPER_N_FFT_HALF * sizeof(cufftComplex)));
|
CUDA_CHECK(cudaMallocAsync(&stft_out, n_frames * WHISPER_N_FFT_HALF * sizeof(cufftComplex), m_stream));
|
||||||
|
|
||||||
cufftHandle plan;
|
cufftHandle plan;
|
||||||
CUFFT_CHECK(cufftCreate(&plan));
|
CUFFT_CHECK(cufftCreate(&plan));
|
||||||
@ -311,11 +311,11 @@ public:
|
|||||||
|
|
||||||
const auto n_mag_frames = n_frames - 1; // drop last frame
|
const auto n_mag_frames = n_frames - 1; // drop last frame
|
||||||
float * magnitudes;
|
float * magnitudes;
|
||||||
CUDA_CHECK(cudaMalloc(&magnitudes, n_mag_frames * WHISPER_N_FFT_HALF * sizeof(float)));
|
CUDA_CHECK(cudaMallocAsync(&magnitudes, n_mag_frames * WHISPER_N_FFT_HALF * sizeof(float), m_stream));
|
||||||
calc_magnitudes(stft_out, int(n_mag_frames), magnitudes, m_stream);
|
calc_magnitudes(stft_out, int(n_mag_frames), magnitudes, m_stream);
|
||||||
|
|
||||||
float * mel_data = nullptr;
|
float * mel_data = nullptr;
|
||||||
CUDA_CHECK(cudaMalloc(&mel_data, m_n_mel * n_mag_frames * sizeof(float)));
|
CUDA_CHECK(cudaMallocAsync(&mel_data, m_n_mel * n_mag_frames * sizeof(float), m_stream));
|
||||||
|
|
||||||
const float fone = 1.0f, fzero = 0.0f;
|
const float fone = 1.0f, fzero = 0.0f;
|
||||||
CUBLAS_CHECK(cublasSgemm(m_cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
|
CUBLAS_CHECK(cublasSgemm(m_cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N,
|
||||||
@ -343,11 +343,11 @@ public:
|
|||||||
|
|
||||||
// cleanup
|
// cleanup
|
||||||
CUFFT_CHECK(cufftDestroy(plan));
|
CUFFT_CHECK(cufftDestroy(plan));
|
||||||
CUDA_CHECK(cudaFree(mel_data));
|
CUDA_CHECK(cudaFreeAsync(mel_data, m_stream));
|
||||||
CUDA_CHECK(cudaFree(magnitudes));
|
CUDA_CHECK(cudaFreeAsync(magnitudes, m_stream));
|
||||||
CUDA_CHECK(cudaFree(stft_out));
|
CUDA_CHECK(cudaFreeAsync(stft_out, m_stream));
|
||||||
CUDA_CHECK(cudaFree(stft_in));
|
CUDA_CHECK(cudaFreeAsync(stft_in, m_stream));
|
||||||
CUDA_CHECK(cudaFree(cu_padded_samples));
|
CUDA_CHECK(cudaFreeAsync(cu_padded_samples, m_stream));
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
@ -3167,7 +3167,7 @@ struct mel_calc_cpu : public whisper_mel_calc {
|
|||||||
}
|
}
|
||||||
|
|
||||||
whisper_mel_calc * whisper_mel_calc_create(ggml_backend_t backend, const whisper_filters & filters) {
|
whisper_mel_calc * whisper_mel_calc_create(ggml_backend_t backend, const whisper_filters & filters) {
|
||||||
#if defined(GGML_USE_CUDA) && !defined(GGML_USE_HIPBLAS)
|
#if GGML_USE_CUDA
|
||||||
if (ggml_backend_is_cuda(backend)) {
|
if (ggml_backend_is_cuda(backend)) {
|
||||||
auto ret = whisper_mel_calc_create_cuda(backend, filters);
|
auto ret = whisper_mel_calc_create_cuda(backend, filters);
|
||||||
// run a warmup to avoid the first kernel launch overhead (thus we get the best perf even on the first run)
|
// run a warmup to avoid the first kernel launch overhead (thus we get the best perf even on the first run)
|
||||||
|
Reference in New Issue
Block a user