whisper : reorganize source code + improve CMake (#2256)

* scripts : update sync [no ci]

* files : reorganize [no ci]

* sync : llama.cpp

* cmake : link math library

* cmake : build normal ggml library

* files : move headers to include

* objc : fix path to ggml-metal.h

* ci : fix WHISPER_CUDA -> GGML_CUDA

* scripts : sync LICENSE [no ci]
Georgi Gerganov 2024-06-26 19:34:09 +03:00 committed by GitHub
parent bf4cb4abad
commit e30c679928
361 changed files with 176536 additions and 20466 deletions
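For orientation: this commit renames the user-facing build switches from the WHISPER_* prefix to GGML_*, as reflected in the CI, Docker, and README changes below. A minimal sketch of the new invocations, using only flags that appear in this diff (assuming a checkout of the reorganized tree):

```
# Makefile-based CUDA build (previously WHISPER_CUDA=1 / WHISPER_CUBLAS=1)
GGML_CUDA=1 make -j

# CMake-based OpenBLAS build (previously -DWHISPER_OPENBLAS=ON)
cmake -B build -DGGML_OPENBLAS=ON
cmake --build build --config Release -j $(nproc)
```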


@ -21,7 +21,7 @@ COPY . .
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV WHISPER_CUBLAS=1
ENV GGML_CUDA=1
RUN make


@ -14,7 +14,7 @@ ARG CUDA_DOCKER_ARCH=all
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV WHISPER_CUBLAS=1
ENV GGML_CUDA=1
RUN apt-get update && \
apt-get install -y build-essential \


@ -101,7 +101,10 @@ jobs:
fail-fast: false
matrix:
build: [Debug, Release]
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
#arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
# TODO: arm/v7 disabled due to clang bug
# https://github.com/ggerganov/whisper.cpp/actions/runs/9657764109/job/26637633042?pr=2256#step:4:1990
arch: [linux/amd64, linux/arm64, linux/ppc64le]
steps:
- name: Clone
@ -197,7 +200,7 @@ jobs:
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
ubuntu-22-cmake-sycl-fp16:
@ -247,7 +250,7 @@ jobs:
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DWHISPER_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake -DGGML_SYCL_F16=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx ..
cmake --build . --config Release -j $(nproc)
windows-msys2:
@ -289,7 +292,7 @@ jobs:
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make WHISPER_OPENBLAS=1 -j $(nproc)
make GGML_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
@ -305,7 +308,7 @@ jobs:
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DWHISPER_OPENBLAS=ON
cmake -B build -DGGML_OPENBLAS=ON
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows:
@ -381,12 +384,9 @@ jobs:
- arch: Win32
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
s2arc: x86
clblast: OFF
- arch: x64
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
s2arc: x64
clblast: ON
clver: 1.6.1
- sdl2: ON
s2ver: 2.28.5
@ -413,26 +413,13 @@ jobs:
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Install OpenCL
if: matrix.clblast == 'ON'
run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl
- name: Fetch CLBlast and set CLBlast_DIR
if: matrix.clblast == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
7z x clblast.zip
7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_OPENBLAS=${{ matrix.blas }}
-DGGML_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
-DWHISPER_SDL2=${{ matrix.sdl2 }}
-DWHISPER_CLBLAST=${{ matrix.clblast }}
- name: Build
run: |
@ -447,15 +434,11 @@ jobs:
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Copy clblast.dll
if: matrix.clblast == 'ON'
run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v4
with:
name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
name: whisper-blas-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
windows-cublas:
@ -498,7 +481,7 @@ jobs:
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DWHISPER_CUDA=${{ matrix.cublas }}
-DGGML_CUDA=${{ matrix.cublas }}
-DWHISPER_SDL2=${{ matrix.sdl2 }}
- name: Build ${{ matrix.cuda-toolkit }}

.gitignore (vendored): 12 changes

@ -10,17 +10,7 @@
/CMakeSettings.json
build/
build-blas/
build-coreml/
build-em/
build-debug/
build-release/
build-rwdi/
build-static/
build-cublas/
build-no-accel/
build-sanitize-addr/
build-sanitize-thread/
build-*/
# SPM
.build/

.gitmodules (vendored): 3 changes

@ -1,3 +0,0 @@
[submodule "bindings/ios"]
path = bindings/ios
url = https://github.com/ggerganov/whisper.spm


@ -1,25 +1,31 @@
cmake_minimum_required (VERSION 3.5)
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
project("whisper.cpp" C CXX)
project("whisper.cpp" VERSION 1.6.2)
include(CheckIncludeFileCXX)
# Allow for the creation of solution folders.
set_property(GLOBAL PROPERTY USE_FOLDERS ON)
project(whisper.cpp VERSION 1.6.2)
set(SOVERSION 1)
#set(CMAKE_WARN_DEPRECATED YES)
set(CMAKE_WARN_UNUSED_CLI YES)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
# Add path to modules
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(WHISPER_STANDALONE ON)
include(GitVars)
include(BuildTypes)
include(git-vars)
# configure project version
if (EXISTS "${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl")
configure_file(${CMAKE_SOURCE_DIR}/bindings/ios/Makefile-tmpl ${CMAKE_SOURCE_DIR}/bindings/ios/Makefile @ONLY)
endif()
configure_file(${CMAKE_SOURCE_DIR}/bindings/javascript/package-tmpl.json ${CMAKE_SOURCE_DIR}/bindings/javascript/package.json @ONLY)
else()
set(WHISPER_STANDALONE OFF)
@ -29,6 +35,11 @@ if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
option(WHISPER_WASM_SINGLE_FILE "whisper: embed WASM inside the generated whisper.js" ON)
# TODO: without these, we get the following error:
# wasm-ld: error: --shared-memory is disallowed by whisper.cpp.o because it was not compiled with 'atomics' or 'bulk-memory' features.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -s TOTAL_STACK=5242880")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
@ -37,793 +48,145 @@ else()
endif()
endif()
# options
option(BUILD_SHARED_LIBS "build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
if (APPLE)
set(WHISPER_METAL_DEFAULT ON)
else()
set(WHISPER_METAL_DEFAULT OFF)
endif()
#
# option list
#
option(BUILD_SHARED_LIBS "whisper: build shared libs" ${BUILD_SHARED_LIBS_DEFAULT})
# general
option(WHISPER_CCACHE "whisper: use ccache if available" ON)
# debug
option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings" ON)
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
endif()
option(WHISPER_NO_AVX "whisper: disable AVX" OFF)
option(WHISPER_NO_AVX2 "whisper: disable AVX2" OFF)
option(WHISPER_NO_AVX512 "whisper: disable AVX512" ON)
option(WHISPER_NO_AVX512_VBMI "whisper: disable AVX512-VBMI" ON)
option(WHISPER_NO_AVX512_VNNI "whisper: disable AVX512-VNNI" ON)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
option(WHISPER_NO_F16C "whisper: disable F16c" OFF)
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
option(WHISPER_METAL "whisper: use Metal" ${WHISPER_METAL_DEFAULT})
option(WHISPER_METAL_NDEBUG "whisper: disable Metal debugging" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
option(WHISPER_METAL_EMBED_LIBRARY "whisper: embed Metal library" OFF)
option(WHISPER_BLAS "whisper: use BLAS" ON)
set (WHISPER_BLAS_VENDOR "Apple" CACHE STRING
"whisper: BLAS library vendor")
else()
option(WHISPER_CUDA "whisper: support for CUDA" OFF)
option(WHISPER_CUDA_FA_ALL_QUANTS "whisper: compile all quants for FlashAttention" OFF)
option(WHISPER_CUBLAS "whisper: support for CUDA (deprecated)" OFF)
option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
option(WHISPER_MKL "whisper: use Intel Math Kernel Library (MKL)" OFF)
option(WHISPER_SYCL "whisper: use SYCL" OFF)
option(WHISPER_SYCL_F16 "whisper: use 16 bit floats for sycl calculations" OFF)
option(WHISPER_BLAS "whisper: use BLAS" OFF)
set (WHISPER_BLAS_VENDOR "Generic" CACHE STRING
"whisper: BLAS library vendor")
endif()
option(WHISPER_PERF "whisper: enable perf timings" OFF)
# build
option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
# sanitizers
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
option(WHISPER_SANITIZE_ADDRESS "whisper: enable address sanitizer" OFF)
option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF)
if (NOT MSVC)
if (WHISPER_SANITIZE_THREAD)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=thread")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=thread")
endif()
# extra artifacts
option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
option(WHISPER_BUILD_SERVER "whisper: build server example" ${WHISPER_STANDALONE})
if (WHISPER_SANITIZE_ADDRESS)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -fno-omit-frame-pointer")
endif()
# 3rd party libs
option(WHISPER_CURL "whisper: use libcurl to download model from an URL" OFF)
option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
if (WHISPER_SANITIZE_UNDEFINED)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=undefined")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=undefined")
endif()
endif()
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffast-math")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=native")
# dependencies
find_package(Threads REQUIRED)
#compile flag sycl
if (WHISPER_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
if (WHISPER_FFMPEG)
# As of cmake 3.27, there is no official cmake support for FindFFmpeg.
# Consequently we added a FindFFmpeg.cmake script to the cmake subfolder:
# whisper.cpp does not need the full ffmpeg libs, just AVFORMAT AVCODEC AVUTIL SWRESAMPLE
# libswresample performs highly optimized audio resampling, rematrixing and sample format conversion operations
# libavcodec provides a generic encoding/decoding framework and contains multiple decoders and encoders for audio, video and subtitle streams, and several bitstream filters.
# libavformat provides a generic framework for multiplexing and demultiplexing (muxing and demuxing) audio, video and subtitle streams.
find_package(FFmpeg REQUIRED)
if (NOT ${FFMPEG_FOUND})
message(FATAL_ERROR "Cannot find ffmpeg libs/headers")
endif()
message(STATUS "Found ffmpeg libs: ${FFMPEG_LIBRARIES}")
message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
message(STATUS "ffmpeg definitions: ${FFMPEG_DEFINITIONS}")
message(STATUS "Found avformat ${AVFORMAT_VERSION}")
include_directories(${FFMPEG_INCLUDE_DIRS})
add_compile_definitions(WHISPER_FFMPEG)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${FFMPEG_LIBRARIES})
endif()
# on APPLE
if (APPLE)
# include Accelerate framework
if (NOT WHISPER_NO_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
message(STATUS "Accelerate framework found")
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64)
else()
message(FATAL_ERROR "Accelerate framework not found")
endif()
endif()
if (WHISPER_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
if (METAL_FRAMEWORK)
message(STATUS "Metal framework found")
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_METAL)
if (WHISPER_METAL_NDEBUG)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_NDEBUG)
endif()
else()
message(FATAL_ERROR "Metal framework not found")
endif()
set(GGML_SOURCES_METAL ggml-metal.m ggml-metal.h)
# copy ggml-common.h and ggml-metal.metal to bin directory
configure_file(ggml-common.h bin/ggml-common.h COPYONLY)
configure_file(ggml-metal.metal bin/ggml-metal.metal COPYONLY)
if (WHISPER_METAL_EMBED_LIBRARY)
enable_language(ASM)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_METAL_EMBED_LIBRARY)
set(METALLIB_SOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
set(COMMON_HEADER "${CMAKE_CURRENT_SOURCE_DIR}/ggml-common.h")
file(MAKE_DIRECTORY "${CMAKE_BINARY_DIR}/autogenerated")
set(EMBED_METALLIB_ASSEMBLY "${CMAKE_BINARY_DIR}/autogenerated/ggml-embed-metallib.s")
set(EMBED_METALLIB_SOURCE "${CMAKE_BINARY_DIR}/autogenerated/ggml-metal-combined.metal")
add_custom_command(
OUTPUT ${EMBED_METALLIB_SOURCE}
COMMAND sed -e "/^#include \\\"ggml-common.h\\\"/r ${COMMON_HEADER}" -e "/^#include \\\"ggml-common.h\\\"/d" ${METALLIB_SOURCE} > ${EMBED_METALLIB_SOURCE}
DEPENDS ${METALLIB_SOURCE} ${COMMON_HEADER}
COMMENT "Generating combined Metal library for embedding"
)
add_custom_command(
OUTPUT ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".section __DATA,__ggml_metallib" > ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".globl _ggml_metallib_start" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo "_ggml_metallib_start:" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".incbin \\\"${EMBED_METALLIB_SOURCE}\\\"" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo ".globl _ggml_metallib_end" >> ${EMBED_METALLIB_ASSEMBLY}
COMMAND echo "_ggml_metallib_end:" >> ${EMBED_METALLIB_ASSEMBLY}
DEPENDS ${EMBED_METALLIB_SOURCE}
COMMENT "Generate assembly for embedded Metal library"
)
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${EMBED_METALLIB_ASSEMBLY})
endif()
endif()
if (WHISPER_COREML)
find_library(FOUNDATION_FRAMEWORK Foundation)
find_library(COREML_FRAMEWORK CoreML)
if (COREML_FRAMEWORK)
message(STATUS "CoreML framework found")
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML)
else()
message(FATAL_ERROR "CoreML framework not found")
endif()
if (WHISPER_COREML_ALLOW_FALLBACK)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_COREML_ALLOW_FALLBACK)
endif()
endif()
endif()
if (WHISPER_BLAS)
if (WHISPER_STATIC)
set(BLA_STATIC ON)
endif()
#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
# set(BLA_SIZEOF_INTEGER 8)
#endif()
set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
find_package(BLAS)
if (BLAS_FOUND)
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
if (("${BLAS_INCLUDE_DIRS}" STREQUAL "") AND NOT (${WHISPER_BLAS_VENDOR} MATCHES "Apple"))
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
find_package(PkgConfig REQUIRED)
if (${WHISPER_BLAS_VENDOR} MATCHES "Generic")
pkg_check_modules(DepBLAS REQUIRED blas)
elseif (${WHISPER_BLAS_VENDOR} MATCHES "OpenBLAS")
# As of openblas v0.3.22, the 64-bit variant is named openblas64.pc
pkg_check_modules(DepBLAS openblas64)
if (NOT DepBLAS_FOUND)
pkg_check_modules(DepBLAS REQUIRED openblas)
endif()
elseif (${WHISPER_BLAS_VENDOR} MATCHES "FLAME")
pkg_check_modules(DepBLAS REQUIRED blis)
elseif (${WHISPER_BLAS_VENDOR} MATCHES "ATLAS")
pkg_check_modules(DepBLAS REQUIRED blas-atlas)
elseif (${WHISPER_BLAS_VENDOR} MATCHES "FlexiBLAS")
pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
elseif (${WHISPER_BLAS_VENDOR} MATCHES "Intel")
# all Intel* libraries share the same include path
pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
elseif (${WHISPER_BLAS_VENDOR} MATCHES "NVHPC")
# this doesn't provide pkg-config
# suggest assigning BLAS_INCLUDE_DIRS manually
if ("${NVHPC_VERSION}" STREQUAL "")
message(WARNING "Better to set NVHPC_VERSION")
else()
set(DepBLAS_FOUND ON)
set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
endif()
endif()
if (DepBLAS_FOUND)
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
" detected by pkgconfig, trying to find cblas.h from possible paths...")
find_path(BLAS_INCLUDE_DIRS
NAMES cblas.h
HINTS
/usr/include
/usr/local/include
/usr/include/openblas
/opt/homebrew/opt/openblas/include
/usr/local/opt/openblas/include
/usr/include/x86_64-linux-gnu/openblas/include
)
endif()
endif()
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
add_compile_options(${BLAS_LINKER_FLAGS})
add_compile_definitions(GGML_USE_BLAS)
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${WHISPER_BLAS_VENDOR} MATCHES "Generic" OR ${WHISPER_BLAS_VENDOR} MATCHES "Intel"))
add_compile_definitions(GGML_BLAS_USE_MKL)
endif()
set(GGML_HEADERS_BLAS ggml-blas.h)
set(GGML_SOURCES_BLAS ggml-blas.cpp)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(WHISPER_EXTRA_INCLUDES ${WHISPER_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
" to set correct WHISPER_BLAS_VENDOR")
endif()
endif()
if (WHISPER_MKL)
find_package(MKL CONFIG REQUIRED PATHS $ENV{MKLROOT})
message(STATUS "Imported oneMKL targets: ${MKL_IMPORTED_TARGETS}")
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_BLAS_USE_MKL)
endif()
if (WHISPER_CUBLAS)
message(WARNING "WHISPER_CUBLAS is deprecated and will be removed in the future.\nUse WHISPER_CUDA instead")
set(WHISPER_CUDA ON)
endif()
if (WHISPER_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (WHISPER_CUDA_F16 OR WHISPER_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
enable_language(CUDA)
file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_CUDA ggml-cuda.h)
list(APPEND GGML_SOURCES_CUDA ggml-cuda.cu)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
if (WHISPER_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
if (WHISPER_STATIC)
if (WIN32)
# As of 12.3.1, the CUDA Toolkit for Windows does not offer a static cublas library
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt CUDA::cufft)
else ()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static CUDA::cufft_static)
endif()
else()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cufft)
endif()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
else()
message(FATAL_ERROR "cuBLAS not found")
endif()
endif()
if (WHISPER_HIPBLAS)
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
endif()
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
endif()
find_package(hip)
find_package(hipblas)
find_package(rocblas)
if (${hipblas_FOUND} AND ${hip_FOUND})
message(STATUS "HIP and hipBLAS found")
set(GGML_HEADERS_ROCM "ggml-cuda.h")
file(GLOB GGML_SOURCES_ROCM "ggml-cuda/*.cu")
list(APPEND GGML_SOURCES_ROCM "ggml-cuda.cu")
file(GLOB SRCS "ggml-cuda/template-instances/fattn-wmma*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/mmq*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
if (WHISPER_CUDA_FA_ALL_QUANTS)
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
else()
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
file(GLOB SRCS "ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
list(APPEND GGML_SOURCES_CUDA ${SRCS})
endif()
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUDA)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
if (WHISPER_STATIC)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
else()
message(FATAL_ERROR "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
endif()
endif()
if( WHISPER_OPENVINO )
find_package(OpenVINO REQUIRED COMPONENTS Runtime)
endif()
if (WHISPER_SYCL)
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
endif()
#todo: AOT
find_package(IntelSYCL REQUIRED)
if (WHISPER_SYCL_F16)
add_compile_definitions(GGML_SYCL_F16)
endif()
add_compile_definitions(GGML_USE_SYCL)
add_compile_options(-I./) #include DPCT
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
set(GGML_HEADERS_SYCL ggml-sycl.h)
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
endif()
# compiler flags
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "RelWithDebInfo")
endif ()
if (WHISPER_ALL_WARNINGS)
if (NOT MSVC)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} \
-Wall \
-Wextra \
-Wpedantic \
-Wshadow \
-Wcast-qual \
-Wstrict-prototypes \
-Wpointer-arith \
-Wno-unused-function \
")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} \
-Wall \
-Wextra \
-Wpedantic \
-Wcast-qual \
")
else()
# todo : msvc
endif()
endif()
if (NOT MSVC)
# TODO: temporary disabled until we figure out ggml-metal.m
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Werror=vla")
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fno-math-errno -ffinite-math-only -funsafe-math-optimizations")
endif()
message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
message(STATUS "ARM detected")
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
message(STATUS "PowerPC detected")
else()
message(STATUS "x86 detected")
if (MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8")
if(NOT WHISPER_NO_AVX512)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX512")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX512")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX512")
# MSVC has no compile-time flags enabling specific
# AVX512 extensions, nor does it define the
# macros corresponding to the extensions.
# Do it manually.
if (NOT WHISPER_NO_AVX512_VBMI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
endif()
if (NOT WHISPER_NO_AVX512_VNNI)
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
endif()
elseif(NOT WHISPER_NO_AVX2)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
elseif(NOT WHISPER_NO_AVX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
endif()
else()
if (EMSCRIPTEN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -s TOTAL_STACK=5242880")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
else()
if(NOT WHISPER_NO_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")
endif()
if(NOT WHISPER_NO_AVX2)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx2")
endif()
if(NOT WHISPER_NO_AVX512)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw")
if(NOT WHISPER_NO_AVX512_VBMI)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vbmi")
endif()
if(NOT WHISPER_NO_AVX512_VNNI)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx512vnni")
endif()
endif()
if(NOT WHISPER_NO_FMA)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfma")
endif()
if(NOT WHISPER_NO_F16C)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mf16c")
endif()
endif()
endif()
endif()
#
# POSIX conformance
#
# clock_gettime came in POSIX.1b (1993)
# CLOCK_MONOTONIC came in POSIX.1-2001 / SUSv3 as optional
# posix_memalign came in POSIX.1-2001 / SUSv3
# M_PI is an XSI extension since POSIX.1-2001 / SUSv3, came in XPG1 (1985)
add_compile_definitions(_XOPEN_SOURCE=600)
# Somehow in OpenBSD whenever POSIX conformance is specified
# some string functions rely on locale_t availability,
# which was introduced in POSIX.1-2008, forcing us to go higher
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
remove_definitions(-D_XOPEN_SOURCE=600)
add_compile_definitions(_XOPEN_SOURCE=700)
endif()
# Data types, macros and functions related to controlling CPU affinity
# are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions(_GNU_SOURCE)
option(WHISPER_FFMPEG "whisper: support building and linking with ffmpeg libs (avcodec, swresample, ...)" OFF)
endif()
# RLIMIT_MEMLOCK came in BSD, is not specified in POSIX.1,
# and on macOS its availability depends on enabling Darwin extensions
# similarly on DragonFly, enabling BSD extensions is necessary
if (CMAKE_SYSTEM_NAME MATCHES "Darwin")
add_compile_definitions(_DARWIN_C_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "DragonFly")
add_compile_definitions(_DARWIN_C_SOURCE)
endif()
option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
option(WHISPER_OPENVINO "whisper: support for OpenVINO" OFF)
# alloca is a non-standard interface that is not visible on BSDs when
# POSIX conformance is specified, but not all of them provide a clean way
# to enable it in such cases
if (CMAKE_SYSTEM_NAME MATCHES "FreeBSD")
add_compile_definitions(__BSD_VISIBLE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "NetBSD")
add_compile_definitions(_NETBSD_SOURCE)
endif()
if (CMAKE_SYSTEM_NAME MATCHES "OpenBSD")
add_compile_definitions(_BSD_SOURCE)
endif()
# Required for relocatable CMake package
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/build-info.cmake)
if (WHISPER_PERF)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_PERF)
endif()
# override ggml options
set(GGML_CCACHE ${WHISPER_CCACHE})
set(GGML_SANITIZE_THREAD ${WHISPER_SANITIZE_THREAD})
set(GGML_SANITIZE_ADDRESS ${WHISPER_SANITIZE_ADDRESS})
set(GGML_SANITIZE_UNDEFINED ${WHISPER_SANITIZE_UNDEFINED})
set(GGML_ALL_WARNINGS ${WHISPER_ALL_WARNINGS})
set(GGML_FATAL_WARNINGS ${WHISPER_FATAL_WARNINGS})
#
# whisper.coreml - Core ML support
#
if (WHISPER_COREML)
set(TARGET whisper.coreml)
add_library(${TARGET}
coreml/whisper-encoder.h
coreml/whisper-encoder.mm
coreml/whisper-encoder-impl.h
coreml/whisper-encoder-impl.m
)
include(DefaultTargetOptions)
target_include_directories(${TARGET} PUBLIC
.
)
target_link_libraries(${TARGET} PRIVATE ${FOUNDATION_FRAMEWORK} ${COREML_FRAMEWORK})
set_target_properties(${TARGET} PROPERTIES
COMPILE_FLAGS "-fobjc-arc"
)
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif()
if (WHISPER_OPENVINO)
set(TARGET whisper.openvino)
add_library(${TARGET} OBJECT
openvino/whisper-openvino-encoder.h
openvino/whisper-openvino-encoder.cpp
)
target_include_directories(${TARGET} PUBLIC
.
)
set_property(TARGET ${TARGET} PROPERTY POSITION_INDEPENDENT_CODE ON)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_OPENVINO)
target_link_libraries(${TARGET} PRIVATE openvino::runtime)
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
endif()
#
# whisper - this is the main library of the project
#
set(TARGET whisper)
add_library(${TARGET}
ggml.h
ggml.c
ggml-alloc.h
ggml-alloc.c
ggml-backend.h
ggml-backend.c
ggml-quants.h
ggml-quants.c
${GGML_SOURCES_METAL}
${GGML_SOURCES_CUDA}
${GGML_SOURCES_SYCL} ${GGML_HEADERS_SYCL}
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
whisper.h
whisper.cpp
)
if (WHISPER_CUDA)
target_sources(${TARGET} PRIVATE whisper-mel-cuda.cu)
endif()
include_directories (
.
)
# Set the version numbers
set_target_properties(whisper PROPERTIES
VERSION ${PROJECT_VERSION}
SOVERSION ${SOVERSION}
)
include(DefaultTargetOptions)
target_include_directories(${TARGET} PUBLIC
.
)
if (WHISPER_COREML)
target_link_libraries(${TARGET} PRIVATE whisper.coreml)
endif()
if (WHISPER_OPENVINO)
target_link_libraries(${TARGET} PRIVATE whisper.openvino)
endif()
if (WHISPER_MKL)
target_link_libraries(${TARGET} PUBLIC MKL::MKL)
endif()
if (MSVC)
target_link_libraries(${TARGET} PRIVATE ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -D_CRT_SECURE_NO_WARNINGS)
else()
target_link_libraries(${TARGET} PRIVATE m ${WHISPER_EXTRA_LIBS} ${CMAKE_THREAD_LIBS_INIT})
endif()
if (BUILD_SHARED_LIBS)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_link_libraries(${TARGET} PUBLIC
${CMAKE_DL_LIBS}
)
target_compile_definitions(${TARGET} PUBLIC
WHISPER_SHARED
GGML_SHARED
)
target_compile_definitions(${TARGET} PRIVATE
WHISPER_BUILD
GGML_BUILD
)
if (WHISPER_METAL)
# TODO: I think this should make ggml-metal.m "see" the ggml-metal.metal file from the "bin" directory
# but for some reason it does not work here like it does in llama.cpp
set_target_properties(${TARGET} PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/ggml-metal.metal")
# transition helpers
function (whisper_option_depr TYPE OLD NEW)
if (${OLD})
message(${TYPE} "${OLD} is deprecated and will be removed in the future.\nUse ${NEW} instead\n")
set(${NEW} ON)
endif()
endif()
endfunction()
if (GGML_SOURCES_CUDA)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
# Only configure ggml CUDA architectures if not globally set
if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
# Not overridden by user, so set defaults
set(GGML_CUDA_ARCHITECTURES 52 61 70)
endif()
message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
endif()
whisper_option_depr(FATAL_ERROR WHISPER_CUBLAS GGML_CUDA)
whisper_option_depr(WARNING WHISPER_CUDA GGML_CUDA)
whisper_option_depr(WARNING WHISPER_KOMPUTE GGML_KOMPUTE)
whisper_option_depr(WARNING WHISPER_METAL GGML_METAL)
whisper_option_depr(WARNING WHISPER_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY)
whisper_option_depr(WARNING WHISPER_NATIVE GGML_NATIVE)
whisper_option_depr(WARNING WHISPER_OPENMP GGML_OPENMP)
whisper_option_depr(WARNING WHISPER_RPC GGML_RPC)
whisper_option_depr(WARNING WHISPER_SYCL GGML_SYCL)
whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
if (EMSCRIPTEN)
set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
endif()
#
# build the library
#
target_compile_definitions(${TARGET} PUBLIC
${WHISPER_EXTRA_FLAGS}
)
add_subdirectory(ggml)
add_subdirectory(src)
set_target_properties(${TARGET} PROPERTIES PUBLIC_HEADER "ggml.h;whisper.h")
set_target_properties(${TARGET} PROPERTIES FOLDER "libs")
#
# install
#
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
install(TARGETS ${TARGET}
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib/static
RUNTIME DESTINATION bin
RESOURCE DESTINATION bin
PUBLIC_HEADER DESTINATION include
)
set(WHISPER_BUILD_NUMBER ${BUILD_NUMBER})
set(WHISPER_BUILD_COMMIT ${BUILD_COMMIT})
set(WHISPER_INSTALL_VERSION ${CMAKE_PROJECT_VERSION})
#
# bindings
#
set(WHISPER_INCLUDE_INSTALL_DIR ${CMAKE_INSTALL_INCLUDEDIR} CACHE PATH "Location of header files")
set(WHISPER_LIB_INSTALL_DIR ${CMAKE_INSTALL_LIBDIR} CACHE PATH "Location of library files")
set(WHISPER_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location of binary files")
add_subdirectory(bindings)
get_directory_property(WHISPER_TRANSIENT_DEFINES COMPILE_DEFINITIONS)
set_target_properties(whisper PROPERTIES PUBLIC_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/include/whisper.h)
install(TARGETS whisper LIBRARY PUBLIC_HEADER)
configure_package_config_file(
${CMAKE_CURRENT_SOURCE_DIR}/cmake/whisper-config.cmake.in
${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper
PATH_VARS
WHISPER_INCLUDE_INSTALL_DIR
WHISPER_LIB_INSTALL_DIR
WHISPER_BIN_INSTALL_DIR )
write_basic_package_version_file(
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
VERSION ${WHISPER_INSTALL_VERSION}
COMPATIBILITY SameMajorVersion)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
install(
FILES convert-hf-to-gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
configure_file(cmake/whisper.pc.in
"${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
@ONLY)
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
DESTINATION lib/pkgconfig)
#
# programs, examples and tests
#
if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
enable_testing()
add_subdirectory(tests)
#include(CTest)
#add_subdirectory(tests)
endif ()
if (WHISPER_BUILD_EXAMPLES)

Makefile: 1252 changes (diff too large to display)


@ -27,17 +27,15 @@ let package = Package(
"samples",
"tests",
"CMakeLists.txt",
"ggml-cuda.cu",
"ggml-cuda.h",
"Makefile"
],
sources: [
"ggml.c",
"whisper.cpp",
"ggml-alloc.c",
"ggml-backend.c",
"ggml-quants.c",
"ggml-metal.m"
"ggml/src/ggml.c",
"src/whisper.cpp",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-metal.m"
],
resources: [.process("ggml-metal.metal")],
publicHeadersPath: "spm-headers",


@ -418,7 +418,7 @@ Now build `whisper.cpp` with CUDA support:
```
make clean
WHISPER_CUDA=1 make -j
GGML_CUDA=1 make -j
```
## BLAS CPU support via OpenBLAS
@ -430,7 +430,7 @@ Now build `whisper.cpp` with OpenBLAS support:
```
make clean
WHISPER_OPENBLAS=1 make -j
GGML_OPENBLAS=1 make -j
```
## BLAS CPU support via Intel MKL

@ -1 +0,0 @@
Subproject commit a2085436c2eb796af90956b62bd64731f5e5b823


@ -1,54 +0,0 @@
# Add new build types
# ReleaseGG - Release with enabled asserts
SET(CMAKE_CXX_FLAGS_RELEASEGG
"-O3"
CACHE STRING "Flags used by the c++ compiler during release builds with enabled asserts."
FORCE )
SET(CMAKE_C_FLAGS_RELEASEGG
"-O3"
CACHE STRING "Flags used by the compiler during release builds with enabled asserts."
FORCE )
SET(CMAKE_EXE_LINKER_FLAGS_RELEASEGG
""
CACHE STRING "Flags used for linking binaries during release builds with enabled asserts."
FORCE )
SET(CMAKE_SHARED_LINKER_FLAGS_RELEASEGG
""
CACHE STRING "Flags used by the shared libraries linker during release builds with enabled asserts."
FORCE )
MARK_AS_ADVANCED(
CMAKE_CXX_FLAGS_RELEASEGG
CMAKE_C_FLAGS_RELEASEGG
CMAKE_EXE_LINKER_FLAGS_RELEASEGG
CMAKE_SHARED_LINKER_FLAGS_RELEASEGG )
# RelWithDebInfoGG - RelWithDebInfo with enabled asserts
SET(CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
"-O2 -g"
CACHE STRING "Flags used by the c++ compiler during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_C_FLAGS_RELWITHDEBINFOGG
"-O2 -g"
CACHE STRING "Flags used by the compiler during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
""
CACHE STRING "Flags used for linking binaries during release builds with debug symbols and enabled asserts."
FORCE )
SET(CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG
""
CACHE STRING "Flags used by the shared libraries linker during release builds with debug symbols and enabled asserts."
FORCE )
MARK_AS_ADVANCED(
CMAKE_CXX_FLAGS_RELWITHDEBINFOGG
CMAKE_C_FLAGS_RELWITHDEBINFOGG
CMAKE_EXE_LINKER_FLAGS_RELWITHDEBINFOGG
CMAKE_SHARED_LINKER_FLAGS_RELWITHDEBINFOGG )
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo" "ReleaseGG" "RelWithDebInfoGG")
endif()

cmake/build-info.cmake (new file): 58 additions

@ -0,0 +1,58 @@
set(BUILD_NUMBER 0)
set(BUILD_COMMIT "unknown")
set(BUILD_COMPILER "unknown")
set(BUILD_TARGET "unknown")
# Look for git
find_package(Git)
if(NOT Git_FOUND)
find_program(GIT_EXECUTABLE NAMES git git.exe)
if(GIT_EXECUTABLE)
set(Git_FOUND TRUE)
message(STATUS "Found Git: ${GIT_EXECUTABLE}")
else()
message(WARNING "Git not found. Build info will not be accurate.")
endif()
endif()
# Get the commit count and hash
if(Git_FOUND)
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --short HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE HEAD
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES
)
if (RES EQUAL 0)
set(BUILD_COMMIT ${HEAD})
endif()
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-list --count HEAD
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
OUTPUT_VARIABLE COUNT
OUTPUT_STRIP_TRAILING_WHITESPACE
RESULT_VARIABLE RES
)
if (RES EQUAL 0)
set(BUILD_NUMBER ${COUNT})
endif()
endif()
if(MSVC)
set(BUILD_COMPILER "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
set(BUILD_TARGET ${CMAKE_VS_PLATFORM_NAME})
else()
execute_process(
COMMAND sh -c "$@ --version | head -1" _ ${CMAKE_C_COMPILER}
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
set(BUILD_COMPILER ${OUT})
execute_process(
COMMAND ${CMAKE_C_COMPILER} -dumpmachine
OUTPUT_VARIABLE OUT
OUTPUT_STRIP_TRAILING_WHITESPACE
)
set(BUILD_TARGET ${OUT})
endif()


@ -0,0 +1,65 @@
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@)
set(GGML_METAL @GGML_METAL@)
set(GGML_HIPBLAS @GGML_HIPBLAS@)
set(GGML_ACCELERATE @GGML_ACCELERATE@)
@PACKAGE_INIT@
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied
find_package(Threads REQUIRED)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate REQUIRED)
endif()
if (GGML_BLAS)
find_package(BLAS REQUIRED)
endif()
if (GGML_CUDA)
find_package(CUDAToolkit REQUIRED)
endif()
if (GGML_METAL)
find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
find_library(METAL_FRAMEWORK Metal REQUIRED)
find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
endif()
if (GGML_HIPBLAS)
find_package(hip REQUIRED)
find_package(hipblas REQUIRED)
find_package(rocblas REQUIRED)
endif()
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${LLAMA_LIB_DIR})
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
add_library(llama UNKNOWN IMPORTED)
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )
check_required_components(Llama)

cmake/whisper.pc.in (new file): 10 additions

@ -0,0 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
includedir=${prefix}/include
Name: whisper
Description: Port of OpenAI's Whisper model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lwhisper
Cflags: -I${includedir}


@ -18,7 +18,7 @@ struct whisper_params {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
@ -58,7 +58,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "\n");
}
int whisper_bench_full(const whisper_params & params) {
static int whisper_bench_full(const whisper_params & params) {
// whisper init
struct whisper_context_params cparams = whisper_context_default_params();


@ -59,7 +59,7 @@ struct whisper_params {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
@ -130,7 +130,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "\n");
}
std::string transcribe(
static std::string transcribe(
whisper_context * ctx,
const whisper_params & params,
const std::vector<float> & pcmf32,
@ -216,7 +216,7 @@ std::string transcribe(
return result;
}
std::vector<std::string> read_allowed_commands(const std::string & fname) {
static std::vector<std::string> read_allowed_commands(const std::string & fname) {
std::vector<std::string> allowed_commands;
std::ifstream ifs(fname);
@ -238,7 +238,7 @@ std::vector<std::string> read_allowed_commands(const std::string & fname) {
return allowed_commands;
}
std::vector<std::string> get_words(const std::string &txt) {
static std::vector<std::string> get_words(const std::string &txt) {
std::vector<std::string> words;
std::istringstream iss(txt);
@ -252,7 +252,7 @@ std::vector<std::string> get_words(const std::string &txt) {
// command-list mode
// guide the transcription to match the most likely command from a provided list
int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
static int process_command_list(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
fprintf(stderr, "\n");
fprintf(stderr, "%s: guided mode\n", __func__);
@ -463,7 +463,7 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
// always-prompt mode
// transcribe the voice into text after valid prompt
int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
static int always_prompt_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true;
bool ask_prompt = true;
@ -543,7 +543,7 @@ int always_prompt_transcription(struct whisper_context * ctx, audio_async & audi
// general-purpose mode
// freely transcribe the voice into text
int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
static int process_general_transcription(struct whisper_context * ctx, audio_async & audio, const whisper_params & params) {
bool is_running = true;
bool have_prompt = false;
bool ask_prompt = true;


@ -219,7 +219,7 @@ bool sdl_poll_events() {
case SDL_QUIT:
{
return false;
} break;
}
default:
break;
}


@ -9,7 +9,7 @@
namespace grammar_parser {
// NOTE: assumes valid utf8 (but checks for overrun)
// copied from whisper.cpp
std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static std::pair<uint32_t, const char *> decode_utf8(const char * src) {
static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t first_byte = static_cast<uint8_t>(*src);
uint8_t highbits = first_byte >> 4;
@ -24,19 +24,19 @@ namespace grammar_parser {
return std::make_pair(value, pos);
}
uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
auto result = state.symbol_ids.insert(std::make_pair(std::string(src, len), next_id));
return result.first->second;
}
uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) {
uint32_t next_id = static_cast<uint32_t>(state.symbol_ids.size());
state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id;
return next_id;
}
void add_rule(
static void add_rule(
parse_state & state,
uint32_t rule_id,
const std::vector<whisper_grammar_element> & rule) {
@ -46,11 +46,11 @@ namespace grammar_parser {
state.rules[rule_id] = rule;
}
bool is_word_char(char c) {
static bool is_word_char(char c) {
return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || ('0' <= c && c <= '9');
}
std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
static std::pair<uint32_t, const char *> parse_hex(const char * src, int size) {
const char * pos = src;
const char * end = src + size;
uint32_t value = 0;
@ -73,7 +73,7 @@ namespace grammar_parser {
return std::make_pair(value, pos);
}
const char * parse_space(const char * src, bool newline_ok) {
static const char * parse_space(const char * src, bool newline_ok) {
const char * pos = src;
while (*pos == ' ' || *pos == '\t' || *pos == '#' ||
(newline_ok && (*pos == '\r' || *pos == '\n'))) {
@ -88,7 +88,7 @@ namespace grammar_parser {
return pos;
}
const char * parse_name(const char * src) {
static const char * parse_name(const char * src) {
const char * pos = src;
while (is_word_char(*pos)) {
pos++;
@ -99,7 +99,7 @@ namespace grammar_parser {
return pos;
}
std::pair<uint32_t, const char *> parse_char(const char * src) {
static std::pair<uint32_t, const char *> parse_char(const char * src) {
if (*src == '\\') {
switch (src[1]) {
case 'x': return parse_hex(src + 2, 2);
@ -122,14 +122,14 @@ namespace grammar_parser {
throw std::runtime_error("unexpected end of input");
}
const char * parse_alternates(
static const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
static const char * parse_sequence(
parse_state & state,
const char * src,
const std::string & rule_name,
@ -229,7 +229,7 @@ namespace grammar_parser {
return pos;
}
const char * parse_alternates(
static const char * parse_alternates(
parse_state & state,
const char * src,
const std::string & rule_name,
@ -247,7 +247,7 @@ namespace grammar_parser {
return pos;
}
const char * parse_rule(parse_state & state, const char * src) {
static const char * parse_rule(parse_state & state, const char * src) {
const char * name_end = parse_name(src);
const char * pos = parse_space(name_end, false);
size_t name_len = name_end - src;
@ -285,7 +285,7 @@ namespace grammar_parser {
}
}
void print_grammar_char(FILE * file, uint32_t c) {
static void print_grammar_char(FILE * file, uint32_t c) {
if (0x20 <= c && c <= 0x7f) {
fprintf(file, "%c", static_cast<char>(c));
} else {
@ -294,7 +294,7 @@ namespace grammar_parser {
}
}
bool is_char_element(whisper_grammar_element elem) {
static bool is_char_element(whisper_grammar_element elem) {
switch (elem.type) {
case WHISPER_GRETYPE_CHAR: return true;
case WHISPER_GRETYPE_CHAR_NOT: return true;
@ -304,7 +304,7 @@ namespace grammar_parser {
}
}
void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) {
static void print_rule_binary(FILE * file, const std::vector<whisper_grammar_element> & rule) {
for (auto elem : rule) {
switch (elem.type) {
case WHISPER_GRETYPE_END: fprintf(file, "END"); break;
@ -334,7 +334,7 @@ namespace grammar_parser {
fprintf(file, "\n");
}
void print_rule(
static void print_rule(
FILE * file,
uint32_t rule_id,
const std::vector<whisper_grammar_element> & rule,
@ -413,7 +413,7 @@ namespace grammar_parser {
}
}
std::vector<const whisper_grammar_element *> parse_state::c_rules() const{
std::vector<const whisper_grammar_element *> parse_state::c_rules() const {
std::vector<const whisper_grammar_element *> ret;
for (const auto & rule : rules) {
ret.push_back(rule.data());


@ -53,7 +53,7 @@ struct commandset {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
@ -109,7 +109,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "\n");
}
uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
static uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
using namespace std::chrono;
uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
uint64_t start_time = time_now;
@ -153,7 +153,7 @@ uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params &
return time_now;
}
json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
static json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
std::vector<whisper_token> prompt_tokens;
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32);
@ -199,7 +199,7 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
// command-list mode
// guide the transcription to match the most likely command from a provided list
json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
static json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)];
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32);
@ -285,7 +285,7 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
}
}
json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
static json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
// TODO: check for token collision
struct commandset cs;
@ -325,7 +325,8 @@ json register_commandset(struct whisper_context * ctx, json jparams, std::vector
commandset_list.push_back(cs);
return json{{"index",index}};
}
json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
static json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
// whisper_state has the pertinent offsets, but there also seem to be a large
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
// I'll give this another pass once everything else is implemented,
@ -335,7 +336,8 @@ json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*para
{"message", "Seeking is not yet supported."}
};
}
json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
static json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
// See: https://www.jsonrpc.org/specification
json id = body.at("id");
try {
@ -375,7 +377,7 @@ json parse_job(const json &body, struct whisper_context * ctx, audio_async &audi
}
}
void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
static void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
std::deque<json> jobqueue;
std::vector<struct commandset> commandset_list;
while (true) {


@ -17,7 +17,7 @@
#endif
// helper function to replace substrings
void replace_all(std::string & s, const std::string & search, const std::string & replace) {
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
for (size_t pos = 0; ; pos += replace.length()) {
pos = s.find(search, pos);
if (pos == std::string::npos) break;
@ -94,17 +94,17 @@ struct whisper_params {
grammar_parser::parse_state grammar_parsed;
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
static void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
char* whisper_param_turn_lowercase(char* in){
static char * whisper_param_turn_lowercase(char * in){
int string_len = strlen(in);
for(int i = 0; i < string_len; i++){
for (int i = 0; i < string_len; i++){
*(in+i) = tolower((unsigned char)*(in+i));
}
return in;
}
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
@ -182,7 +182,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
static void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options] file0.wav file1.wav ...\n", argv[0]);
fprintf(stderr, "\n");
@ -248,7 +248,7 @@ struct whisper_print_user_data {
int progress_prev;
};
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
static std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
std::string speaker = "";
const int64_t n_samples = pcmf32s[0].size();
@ -280,7 +280,8 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
return speaker;
}
void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
static void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
@ -289,7 +290,7 @@ void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct wh
}
}
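Usage sketch (not part of the patch): these now-static callbacks get installed through whisper_full_params before running inference. The field names come from the public whisper.h API; the initializer order for whisper_print_user_data is an assumption based on the callback bodies above.

    whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
    whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
    wparams.progress_callback              = whisper_print_progress_callback;
    wparams.progress_callback_user_data    = &user_data;
    wparams.new_segment_callback           = whisper_print_segment_callback;
    wparams.new_segment_callback_user_data = &user_data;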
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
static void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params;
const auto & pcmf32s = *((whisper_print_user_data *) user_data)->pcmf32s;
@ -358,7 +359,7 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
}
}
bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
static bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -385,7 +386,7 @@ bool output_txt(struct whisper_context * ctx, const char * fname, const whisper_
return true;
}
bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
static bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -417,7 +418,7 @@ bool output_vtt(struct whisper_context * ctx, const char * fname, const whisper_
return true;
}
bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
static bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -446,7 +447,7 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
return true;
}
char *escape_double_quotes_and_backslashes(const char *str) {
static char * escape_double_quotes_and_backslashes(const char * str) {
if (str == NULL) {
return NULL;
}
@ -459,7 +460,7 @@ char *escape_double_quotes_and_backslashes(const char *str) {
}
}
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
char * escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) {
return NULL;
}
@ -478,7 +479,7 @@ char *escape_double_quotes_and_backslashes(const char *str) {
}
// double quote should be escaped by another double quote. (rfc4180)
char *escape_double_quotes_in_csv(const char *str) {
static char * escape_double_quotes_in_csv(const char * str) {
if (str == NULL) {
return NULL;
}
@ -509,7 +510,7 @@ char *escape_double_quotes_in_csv(const char *str) {
return escaped;
}
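Caller-side sketch (not part of the patch) for the RFC 4180 helper above; the buffer is heap-allocated (the sibling helper uses calloc), so freeing it is assumed to be the caller's responsibility:

    const char * text = "he said \"hello\"";
    char * text_escaped = escape_double_quotes_in_csv(text); // -> he said ""hello""
    if (text_escaped) {
        fout << "\"" << text_escaped << "\""; // wrap the already-escaped field in quotes
        free(text_escaped);
    }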
bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
static bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -544,7 +545,7 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
return true;
}
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
static bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@ -563,7 +564,7 @@ bool output_score(struct whisper_context * ctx, const char * fname, const whispe
return true;
}
bool output_json(
static bool output_json(
struct whisper_context * ctx,
const char * fname,
const whisper_params & params,
@ -734,7 +735,7 @@ bool output_json(
// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments
bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
static bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
@ -859,7 +860,7 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f
return true;
}
bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
static bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
@ -900,7 +901,7 @@ bool output_lrc(struct whisper_context * ctx, const char * fname, const whisper_
}
void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
static void cb_log_disable(enum ggml_log_level , const char * , void * ) { }
int main(int argc, char ** argv) {
whisper_params params;

View File

@ -36,7 +36,7 @@ struct whisper_filters {
};
// quantize a model
bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
static bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());

View File

@ -44,7 +44,7 @@ struct whisper_params {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];

File diff suppressed because it is too large

View File

@ -174,6 +174,7 @@ extern "C" {
LLAMA_POOLING_TYPE_NONE = 0,
LLAMA_POOLING_TYPE_MEAN = 1,
LLAMA_POOLING_TYPE_CLS = 2,
LLAMA_POOLING_TYPE_LAST = 3,
};
enum llama_split_mode {
@ -293,7 +294,6 @@ extern "C" {
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
// (ignored if no pooling layer)
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
@ -786,6 +786,10 @@ extern "C" {
// Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not
LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
// Set whether to use causal attention or not
// If set to true, the model will only attend to the past tokens
LLAMA_API void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn);
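Usage sketch for the new llama_set_embeddings() entry point (not part of the patch; the decode and embedding-reading calls are assumed from the existing public llama.h API):

    llama_set_embeddings(ctx, true);                          // return embeddings instead of logits
    if (llama_decode(ctx, batch) == 0) {
        const float * emb = llama_get_embeddings_seq(ctx, 0); // pooled per-sequence result
        // ... consume llama_n_embd(model) floats ...
    }
    llama_set_embeddings(ctx, false);                         // back to logits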

View File

@ -16,7 +16,7 @@
#include <regex>
#include <sstream>
std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos) {
auto * model = llama_get_model(ctx);
// upper limit for the number of tokens
@ -33,7 +33,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
return result;
}
std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
if (n_tokens < 0) {
@ -83,7 +83,7 @@ struct whisper_params {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
@ -168,7 +168,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "\n");
}
std::string transcribe(
static std::string transcribe(
whisper_context * ctx,
const whisper_params & params,
const std::vector<float> & pcmf32,
@ -235,7 +235,7 @@ std::string transcribe(
return result;
}
std::vector<std::string> get_words(const std::string &txt) {
static std::vector<std::string> get_words(const std::string &txt) {
std::vector<std::string> words;
std::istringstream iss(txt);

File diff suppressed because it is too large

View File

@ -226,8 +226,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
assert(offset_end <= cpts.size());
start = offset_end;
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@ -309,7 +310,7 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
}
// regex: \s+(?!\S)
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
pos += num_whitespaces - 1;
_add_token(pos);
continue;
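Standalone sketch of the sentinel pattern introduced above (not part of the patch): an out-of-range position now maps to a value that can never collide with a real codepoint, so "past the end of the current chunk" is no longer confused with an actual codepoint value.

    #include <cstdint>
    #include <vector>
    static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF; // > 0x10FFFF, never a valid codepoint
    static uint32_t get_cpt(const std::vector<uint32_t> & cpts, std::size_t pos) {
        return pos < cpts.size() ? cpts[pos] : OUT_OF_RANGE;
    }
    // e.g. bool has_next = get_cpt(cpts, pos + num_whitespaces) != OUT_OF_RANGE;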
@ -344,8 +345,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
assert(offset_end <= cpts.size());
start = offset_end;
static const uint32_t OUT_OF_RANGE = 0xFFFFFFFF;
auto _get_cpt = [&] (const size_t pos) -> uint32_t {
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : OUT_OF_RANGE;
};
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
@ -450,7 +452,7 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
}
// regex: \s+(?!\S)
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != OUT_OF_RANGE) {
pos += num_whitespaces - 1;
_add_token(pos);
continue;
@ -594,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
std::vector<uint32_t> result;
result.reserve(utf8.size());
size_t offset = 0;
while (offset < utf8.size()) {
result.push_back(unicode_cpt_from_utf8(utf8, offset));
@ -679,10 +682,14 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
continue;
}
const int cpt_flag = unicode_cpt_flags(cpts[i]).category_flag();
const auto flags = unicode_cpt_flags(cpts[i]);
if (k_ucat_cpt.find(cpt_flag) != k_ucat_cpt.end()) {
text_collapsed[i] = k_ucat_cpt.at(cpt_flag);
if (flags.is_whitespace) {
//NOTE: C++ std::regex \s does not match 0x85, but Rust and Python regexes do.
//text_collapsed[i] = (char) 0x85; // <Next Line> as whitespace fallback
text_collapsed[i] = (char) 0x0B; // <vertical tab> as whitespace fallback
} else if (k_ucat_cpt.find(flags.category_flag()) != k_ucat_cpt.end()) {
text_collapsed[i] = k_ucat_cpt.at(flags.category_flag());
} else {
text_collapsed[i] = (char) 0xD0; // fallback
}
@ -766,9 +773,16 @@ std::vector<std::string> unicode_regex_split(const std::string & text, const std
bpe_offsets = unicode_regex_split_stl(text_collapsed, regex_expr_collapsed, bpe_offsets);
} else {
// no unicode category used, we can use std::wregex directly
const std::wstring wtext = unicode_wstring_from_utf8(text);
const std::wstring wregex_expr = unicode_wstring_from_utf8(regex_expr);
// std::wregex \s does not match non-ASCII whitespace, so 0x0B is used as a fallback
std::wstring wtext(cpts.begin(), cpts.end());
for (size_t i = 0; i < wtext.size(); ++i) {
if (wtext[i] > 0x7F && unicode_cpt_flags(wtext[i]).is_whitespace) {
wtext[i] = 0x0B;
}
}
//printf("text: %s\n", text.c_str());
//printf("regex_expr: %s\n", regex_expr.c_str());
bpe_offsets = unicode_regex_split_stl(wtext, wregex_expr, bpe_offsets);
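Self-contained illustration of the limitation noted above (not part of the patch): U+0085 <Next Line> is Unicode whitespace, but std::wregex's \s typically does not match it, hence the 0x0B remapping before matching.

    #include <cstdio>
    #include <regex>
    #include <string>
    int main() {
        std::wstring s = L"a\u0085b";                  // U+0085 <Next Line>
        std::wregex ws(L"\\s");
        std::printf("raw:      %d\n", (int) std::regex_search(s, ws)); // typically 0
        for (auto & wc : s) if (wc == 0x85) wc = 0x0B; // the fallback used above
        std::printf("remapped: %d\n", (int) std::regex_search(s, ws)); // 1
        return 0;
    }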

View File

@ -72,7 +72,7 @@ struct gpt2_model {
};
// load the model's weights from a file
bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
static bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
@ -380,7 +380,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - embd_w: the predicted logits for the next token
//
// TODO: sync latest version from ggml repo
bool gpt2_eval(
static bool gpt2_eval(
const gpt2_model & model,
const int n_threads,
const int n_past,

View File

@ -44,7 +44,7 @@ struct whisper_params {
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
static bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
@ -109,7 +109,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, "\n");
}
std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
static std::string transcribe(whisper_context * ctx, const whisper_params & params, const std::vector<float> & pcmf32, float & prob, int64_t & t_ms) {
const auto t_start = std::chrono::high_resolution_clock::now();
prob = 0.0f;

View File

@ -5,15 +5,14 @@ project(whisper.cpp)
set(CMAKE_CXX_STANDARD 11)
set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(
SOURCE_FILES
${WHISPER_LIB_DIR}/ggml.c
${WHISPER_LIB_DIR}/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml-backend.c
${WHISPER_LIB_DIR}/ggml-quants.c
${WHISPER_LIB_DIR}/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c
)
set(SOURCE_FILES
${WHISPER_LIB_DIR}/ggml/src/ggml.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
${WHISPER_LIB_DIR}/src/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c
)
find_library(LOG_LIB log)
@ -41,7 +40,6 @@ function(build_library target_name)
#target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
#target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
#target_link_options(${target_name} PRIVATE -flto)
endif ()
endfunction()
@ -54,3 +52,7 @@ elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
endif ()
include_directories(${WHISPER_LIB_DIR})
include_directories(${WHISPER_LIB_DIR}/src)
include_directories(${WHISPER_LIB_DIR}/include)
include_directories(${WHISPER_LIB_DIR}/ggml/include)
include_directories(${WHISPER_LIB_DIR}/ggml/src)

View File

@ -10,7 +10,7 @@ option(GGML_HOME "whisper: Path to external GGML source" OFF)
set(
SOURCE_FILES
${WHISPER_LIB_DIR}/whisper.cpp
${WHISPER_LIB_DIR}/src/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c
)
@ -18,10 +18,10 @@ if (NOT GGML_HOME)
set(
SOURCE_FILES
${SOURCE_FILES}
${WHISPER_LIB_DIR}/ggml.c
${WHISPER_LIB_DIR}/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml-backend.c
${WHISPER_LIB_DIR}/ggml-quants.c
${WHISPER_LIB_DIR}/ggml/src/ggml.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
)
endif()
@ -75,3 +75,7 @@ endif ()
build_library("whisper") # Default target
include_directories(${WHISPER_LIB_DIR})
include_directories(${WHISPER_LIB_DIR}/src)
include_directories(${WHISPER_LIB_DIR}/include)
include_directories(${WHISPER_LIB_DIR}/ggml/include)
include_directories(${WHISPER_LIB_DIR}/ggml/src)

View File

@ -9,7 +9,6 @@
/* Begin PBXBuildFile section */
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
184447212AB21B43007D6BFE /* ggml-metal.metal in CopyFiles */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7D29052BDF00BD2A04 /* SceneDelegate.m */; };
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8029052BDF00BD2A04 /* ViewController.m */; };
@ -20,6 +19,8 @@
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL"; }; };
18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL"; }; };
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
@ -29,23 +30,24 @@
/* End PBXBuildFile section */
/* Begin PBXCopyFilesBuildPhase section */
184447202AB21B25007D6BFE /* CopyFiles */ = {
184447202AB21B25007D6BFE /* Copy Files */ = {
isa = PBXCopyFilesBuildPhase;
buildActionMask = 2147483647;
dstPath = "";
dstSubfolderSpec = 7;
files = (
184447212AB21B43007D6BFE /* ggml-metal.metal in CopyFiles */,
18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */,
);
name = "Copy Files";
runOnlyForDeploymentPostprocessing = 0;
};
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml-alloc.c"; sourceTree = "<group>"; };
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml-alloc.h"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml-metal.m"; sourceTree = "<group>"; };
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml-metal.metal"; sourceTree = "<group>"; };
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal.metal"; sourceTree = "<group>"; };
18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -58,17 +60,19 @@
18627C8829052BE000BD2A04 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
18627C8A29052BE000BD2A04 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
18627C8B29052BE000BD2A04 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
18627C9229052C2B00BD2A04 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../whisper.h; sourceTree = "<group>"; };
18627C9329052C4900BD2A04 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../whisper.cpp; sourceTree = "<group>"; };
18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml.c; sourceTree = "<group>"; };
18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml.h; sourceTree = "<group>"; };
18627C9229052C2B00BD2A04 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../include/whisper.h; sourceTree = "<group>"; };
18627C9329052C4900BD2A04 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../src/whisper.cpp; sourceTree = "<group>"; };
18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml/src/ggml.c; sourceTree = "<group>"; };
18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml/include/ggml.h; sourceTree = "<group>"; };
18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = "ggml-base.en.bin"; path = "../../../models/ggml-base.en.bin"; sourceTree = "<group>"; };
18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml-quants.h"; sourceTree = "<group>"; };
18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml-backend.h"; sourceTree = "<group>"; };
18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml-backend-impl.h"; sourceTree = "<group>"; };
18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml-backend.c"; sourceTree = "<group>"; };
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml-impl.h"; sourceTree = "<group>"; };
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml-quants.c"; sourceTree = "<group>"; };
18A275FE2C2A94DE001C8D37 /* ggml-metal.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-metal.h"; path = "../../../ggml/include/ggml-metal.h"; sourceTree = "<group>"; };
18A275FF2C2A9563001C8D37 /* ggml-common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-common.h"; path = "../../../ggml/src/ggml-common.h"; sourceTree = "<group>"; };
18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
@ -108,6 +112,8 @@
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup;
children = (
18A275FF2C2A9563001C8D37 /* ggml-common.h */,
18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
18ABE1572AF556340044A204 /* ggml-backend.c */,
18ABE1552AF556340044A204 /* ggml-backend.h */,
@ -151,7 +157,7 @@
7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */,
);
name = coreml;
path = ../../../coreml;
path = ../../../src/coreml;
sourceTree = "<group>";
};
/* End PBXGroup section */
@ -164,7 +170,7 @@
18627C7229052BDF00BD2A04 /* Sources */,
18627C7329052BDF00BD2A04 /* Frameworks */,
18627C7429052BDF00BD2A04 /* Resources */,
184447202AB21B25007D6BFE /* CopyFiles */,
184447202AB21B25007D6BFE /* Copy Files */,
);
buildRules = (
);
@ -182,7 +188,7 @@
isa = PBXProject;
attributes = {
BuildIndependentTargetsInParallel = 1;
LastUpgradeCheck = 1400;
LastUpgradeCheck = 1540;
TargetAttributes = {
18627C7529052BDF00BD2A04 = {
CreatedOnToolsVersion = 14.0.1;
@ -212,6 +218,7 @@
isa = PBXResourcesBuildPhase;
buildActionMask = 2147483647;
files = (
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */,
18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */,
7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */,
18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */,
@ -301,6 +308,7 @@
DEBUG_INFORMATION_FORMAT = dwarf;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_TESTABILITY = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_DYNAMIC_NO_PIC = NO;
GCC_NO_COMMON_BLOCKS = YES;
@ -359,6 +367,7 @@
DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
ENABLE_NS_ASSERTIONS = NO;
ENABLE_STRICT_OBJC_MSGSEND = YES;
ENABLE_USER_SCRIPT_SANDBOXING = YES;
GCC_C_LANGUAGE_STANDARD = gnu11;
GCC_NO_COMMON_BLOCKS = YES;
GCC_WARN_64_TO_32_BIT_CONVERSION = YES;
@ -400,6 +409,7 @@
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
MTL_HEADER_SEARCH_PATHS = "";
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;
@ -428,6 +438,7 @@
"@executable_path/Frameworks",
);
MARKETING_VERSION = 1.0;
MTL_HEADER_SEARCH_PATHS = "";
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
PRODUCT_NAME = "$(TARGET_NAME)";
SWIFT_EMIT_LOC_STRINGS = YES;

View File

@ -15,7 +15,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
private var audioPlayer: AVAudioPlayer?
private var modelUrl: URL? {
Bundle.main.url(forResource: "ggml-tiny.en", withExtension: "bin", subdirectory: "models")
Bundle.main.url(forResource: "ggml-base.en", withExtension: "bin", subdirectory: "models")
}
private var sampleUrl: URL? {

File diff suppressed because it is too large

File diff suppressed because it is too large

1
ggml/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
src/ggml-metal-embed.metal

238
ggml/CMakeLists.txt Normal file
View File

@ -0,0 +1,238 @@
cmake_minimum_required(VERSION 3.14) # for add_link_options and implicit target directories.
project("ggml" C CXX)
include(CheckIncludeFileCXX)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()
if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
set(GGML_STANDALONE ON)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
# configure project version
# TODO
else()
set(GGML_STANDALONE OFF)
endif()
if (EMSCRIPTEN)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
option(GGML_WASM_SINGLE_FILE "ggml: embed WASM inside the generated ggml.js" ON)
else()
if (MINGW)
set(BUILD_SHARED_LIBS_DEFAULT OFF)
else()
set(BUILD_SHARED_LIBS_DEFAULT ON)
endif()
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
#
# option list
#
# TODO: mark all options as advanced when not GGML_STANDALONE
if (APPLE)
set(GGML_METAL_DEFAULT ON)
set(GGML_BLAS_DEFAULT ON)
set(GGML_BLAS_VENDOR_DEFAULT "Apple")
else()
set(GGML_METAL_DEFAULT OFF)
set(GGML_BLAS_DEFAULT OFF)
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif()
# general
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: enable -march=native flag" ON)
option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)
# debug
option(GGML_ALL_WARNINGS "ggml: enable all compiler warnings" ON)
option(GGML_ALL_WARNINGS_3RD_PARTY "ggml: enable all compiler warnings in 3rd party libs" OFF)
option(GGML_GPROF "ggml: enable gprof" OFF)
# build
option(GGML_FATAL_WARNINGS "ggml: enable -Werror flag" OFF)
# sanitizers
option(GGML_SANITIZE_THREAD "ggml: enable thread sanitizer" OFF)
option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
# instruction set specific
if (GGML_NATIVE)
set(INS_ENB OFF)
else()
set(INS_ENB ON)
endif()
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
endif()
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
# 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
"ggml: iters./thread per block for Q2_K/Q6_K")
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
set (GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
"ggml: metal minimum macOS version")
set (GGML_METAL_STD "" CACHE STRING "ggml: metal standard version (-std flag)")
option(GGML_OPENMP "ggml: use OpenMP" ON)
option(GGML_RPC "ggml: use RPC" OFF)
option(GGML_SYCL "ggml: use SYCL" OFF)
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
#
# dependencies
#
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
if (GGML_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
find_package(Threads REQUIRED)
#
# build the library
#
add_subdirectory(src)
#
# tests and examples
#
if (GGML_BUILD_TESTS)
enable_testing()
add_subdirectory(tests)
endif ()
if (GGML_BUILD_EXAMPLES)
add_subdirectory(examples)
endif ()
#
# install
#
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
set(GGML_PUBLIC_HEADERS
include/ggml.h
include/ggml-alloc.h
include/ggml-backend.h
"${GGML_HEADERS_CUDA}"
"${GGML_HEADERS_METAL}"
"${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
install(TARGETS ggml PUBLIC_HEADER)
if (BUILD_SHARED_LIBS)
install(TARGETS ggml LIBRARY)
endif()
if (GGML_METAL)
install(
FILES src/ggml-metal.metal
PERMISSIONS
OWNER_READ
OWNER_WRITE
GROUP_READ
WORLD_READ
DESTINATION ${CMAKE_INSTALL_BINDIR})
if (NOT GGML_METAL_EMBED_LIBRARY)
install(
FILES ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib
DESTINATION ${CMAKE_INSTALL_BINDIR}
)
endif()
endif()
if (GGML_STANDALONE)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/ggml.pc.in
${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
@ONLY)
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/ggml.pc
DESTINATION share/pkgconfig)
endif()

100
ggml/cmake/FindSIMD.cmake Normal file
View File

@ -0,0 +1,100 @@
include(CheckCSourceRuns)
set(AVX_CODE "
#include <immintrin.h>
int main()
{
__m256 a;
a = _mm256_set1_ps(0);
return 0;
}
")
set(AVX512_CODE "
#include <immintrin.h>
int main()
{
__m512i a = _mm512_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0);
__m512i b = a;
__mmask64 equality_mask = _mm512_cmp_epi8_mask(a, b, _MM_CMPINT_EQ);
return 0;
}
")
set(AVX2_CODE "
#include <immintrin.h>
int main()
{
__m256i a = {0};
a = _mm256_abs_epi16(a);
__m256i x;
_mm256_extract_epi64(x, 0); // we rely on this in our AVX2 code
return 0;
}
")
set(FMA_CODE "
#include <immintrin.h>
int main()
{
__m256 acc = _mm256_setzero_ps();
const __m256 d = _mm256_setzero_ps();
const __m256 p = _mm256_setzero_ps();
acc = _mm256_fmadd_ps( d, p, acc );
return 0;
}
")
macro(check_sse type flags)
set(__FLAG_I 1)
set(CMAKE_REQUIRED_FLAGS_SAVE ${CMAKE_REQUIRED_FLAGS})
foreach (__FLAG ${flags})
if (NOT ${type}_FOUND)
set(CMAKE_REQUIRED_FLAGS ${__FLAG})
check_c_source_runs("${${type}_CODE}" HAS_${type}_${__FLAG_I})
if (HAS_${type}_${__FLAG_I})
set(${type}_FOUND TRUE CACHE BOOL "${type} support")
set(${type}_FLAGS "${__FLAG}" CACHE STRING "${type} flags")
endif()
math(EXPR __FLAG_I "${__FLAG_I}+1")
endif()
endforeach()
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_SAVE})
if (NOT ${type}_FOUND)
set(${type}_FOUND FALSE CACHE BOOL "${type} support")
set(${type}_FLAGS "" CACHE STRING "${type} flags")
endif()
mark_as_advanced(${type}_FOUND ${type}_FLAGS)
endmacro()
# flags are for MSVC only!
check_sse("AVX" " ;/arch:AVX")
if (NOT ${AVX_FOUND})
set(GGML_AVX OFF)
else()
set(GGML_AVX ON)
endif()
check_sse("AVX2" " ;/arch:AVX2")
check_sse("FMA" " ;/arch:AVX2")
if ((NOT ${AVX2_FOUND}) OR (NOT ${FMA_FOUND}))
set(GGML_AVX2 OFF)
else()
set(GGML_AVX2 ON)
endif()
check_sse("AVX512" " ;/arch:AVX512")
if (NOT ${AVX512_FOUND})
set(GGML_AVX512 OFF)
else()
set(GGML_AVX512 ON)
endif()

View File

@ -0,0 +1,220 @@
#!/usr/bin/env python
import logging
import argparse
import asyncio
import os
from tempfile import gettempdir
logger = logging.getLogger("ggml-vk-generate-shaders")
GLSLC = "glslc"
type_names = [
"f32",
"f16",
"q4_0",
"q4_1",
"q5_0",
"q5_1",
"q8_0",
"q2_k",
"q3_k",
"q4_k",
"q5_k",
"q6_k",
]
ASYNCIO_CONCURRENCY = 64
input_dir = "vulkan-shaders"
output_dir = gettempdir()
lock = asyncio.Lock()
shader_fnames = []
async def string_to_spv(name, in_fname, defines, fp16=True):
name = f"{name}{'_fp32' if not fp16 else ''}"
out_fname = os.path.join(output_dir, f"{name}.spv")
in_path = os.path.join(input_dir, in_fname)
cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname]
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
stdout, stderr = await proc.communicate()
stdout = stdout.decode()
error = stderr.decode()
if proc.returncode:
cmd = " ".join(cmd)
logger.error(f"cannot compile {name}\n\n{cmd}\n\n{error}")
return
async with lock:
shader_fnames.append((name, out_fname))
def matmul_shaders(tasks, fp16, matmul_id):
if fp16:
load_vec = "8"
aligned_b_type_f32 = "mat2x4"
aligned_b_type_f16 = "f16mat2x4"
else:
load_vec = "4"
aligned_b_type_f32 = "vec4"
aligned_b_type_f16 = "f16vec4"
base_dict = {"FLOAT_TYPE": "float" if not fp16 else "float16_t"}
shader_name = "matmul"
if matmul_id:
base_dict["MUL_MAT_ID"] = "1"
shader_name = "matmul_id"
if fp16:
base_dict["FLOAT16"] = "1"
# Shaders with f16 B_TYPE
tasks.append(string_to_spv(f"{shader_name}_f32_f16", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f32_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f16", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
for tname in type_names:
data_a_key = f"DATA_A_{tname.upper()}"
load_vec_a = load_vec if tname in ("f32", "f16") else "2"
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32", "mul_mm.comp", base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32_aligned", "mul_mm.comp", base_dict | {data_a_key: "2", "LOAD_VEC_A": load_vec_a, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f32, "D_TYPE": "float"}, fp16))
async def main():
logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
tasks = []
for fp16 in (False, True):
# MUL_MAT
matmul_shaders(tasks, fp16, False)
# MUL_MAT_ID
matmul_shaders(tasks, fp16, True)
for tname in type_names:
base_dict = {"FLOAT_TYPE": "float"}
# mul mat vec
data_a_key = f"DATA_A_{tname.upper()}"
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f32_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f16_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv(f"mul_mat_vec_id_{tname}_f32", shader, base_dict | {"MUL_MAT_ID": "1", data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
# Dequant shaders
if tname != "f16":
tasks.append(string_to_spv(f"dequant_{tname}", f"dequant_{tname}.comp", base_dict | {data_a_key: "1", "D_TYPE": "float16_t"}))
# get_rows
if not tname.endswith("_k"):
shader = "get_rows.comp" if tname in ("f32", "f16") else "get_rows_quant.comp"
if tname == "f16":
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
else:
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv(f"get_rows_{tname}_f32", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
# Norms
tasks.append(string_to_spv("norm_f32", "norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rms_norm_f32", "rms_norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f32", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("cpy_f32_f16", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("cpy_f16_f16", "copy.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
tasks.append(string_to_spv("add_f32", "add.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}))
tasks.append(string_to_spv("mul_f32", "mul.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("div_f32", "div.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("scale_f32", "scale.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("sqr_f32", "square.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("clamp_f32", "clamp.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
tasks.append(string_to_spv("gelu_f32", "gelu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("silu_f32", "silu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("relu_f32", "relu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("soft_max_f32_f16", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f32", "rope_norm.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_norm_f16", "rope_norm.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("rope_neox_f32", "rope_neox.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
tasks.append(string_to_spv("rope_neox_f16", "rope_neox.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
tasks.append(string_to_spv("argsort_f32", "argsort.comp", {"A_TYPE": "float"}))
tasks.append(string_to_spv("sum_rows_f32", "sum_rows.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
# Helper to decorate tasks with semaphore acquisition.
async def withSemaphore(sem, task):
async with sem:
return await task
# Run tasks concurrently guarded by a concurrency limit.
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
with open("ggml-vulkan-shaders.hpp", "w") as f:
f.write("#include <cstdint>\n\n")
for name, path in sorted(shader_fnames):
with open(path, "rb") as spv:
counter = 0
newline_counter = 0
f.write(f"unsigned char {name}_data[] = {{\n")
for val in spv.read():
f.write(f"0x{val:02x},")
newline_counter += 1
counter += 1
if newline_counter >= 12:
newline_counter = 0
f.write("\n")
f.write("\n};\n")
f.write(f"const uint64_t {name}_len = {counter};\n\n")
os.remove(path)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
parser.add_argument("--glslc", help="Path to glslc")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if args.glslc:
GLSLC = args.glslc
asyncio.run(main())
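The script writes each compiled shader into ggml-vulkan-shaders.hpp as an unsigned char <name>_data[] array plus a <name>_len length. A consumer-side sketch (an assumption for illustration; this diff does not show how ggml-vulkan consumes the header) that turns one embedded blob into a Vulkan shader module:

    #include <vulkan/vulkan.h>
    #include <cstdint>
    #include <cstring>
    #include <vector>
    extern unsigned char add_f32_data[];  // generated symbols for the "add_f32" shader above
    extern const uint64_t add_f32_len;
    VkShaderModule make_add_f32_module(VkDevice device) {
        // copy into uint32_t storage to guarantee the 4-byte alignment SPIR-V requires
        std::vector<uint32_t> words((add_f32_len + 3) / 4);
        std::memcpy(words.data(), add_f32_data, add_f32_len);
        VkShaderModuleCreateInfo info = {};
        info.sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
        info.codeSize = (size_t) add_f32_len;  // size in bytes
        info.pCode    = words.data();
        VkShaderModule module = VK_NULL_HANDLE;
        vkCreateShaderModule(device, &info, nullptr, &module);
        return module;
    }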

View File

@ -8,7 +8,9 @@
#include "ggml.h"
#include "ggml-backend.h"
#include "ggml-sycl/presets.hpp"
#define GGML_SYCL_NAME "SYCL"
#define GGML_SYCL_MAX_DEVICES 48
#ifdef __cplusplus
extern "C" {

View File

@ -312,6 +312,12 @@
GGML_TENSOR_LOCALS(int64_t, ne, dst, ne) \
GGML_TENSOR_LOCALS(size_t, nb, dst, nb)
#define GGML_TENSOR_BINARY_OP_LOCALS01 \
GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
GGML_TENSOR_LOCALS(size_t, nb0, src0, nb) \
GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
GGML_TENSOR_LOCALS(size_t, nb1, src1, nb)
#ifdef __cplusplus
extern "C" {
#endif
@ -585,11 +591,7 @@ extern "C" {
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
// source tensor and offset for views
struct ggml_tensor * view_src;
size_t view_offs;
@ -599,7 +601,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
// char padding[4];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -646,11 +648,6 @@ extern "C" {
struct ggml_hash_set visited_hash_table;
enum ggml_cgraph_eval_order order;
// performance
int perf_runs;
int64_t perf_cycles;
int64_t perf_time_us;
};
// scratch buffer
@ -667,28 +664,6 @@ extern "C" {
bool no_alloc; // don't allocate memory for the tensor data
};
// compute types
// NOTE: the INIT or FINALIZE pass is not scheduled unless explicitly enabled.
// This behavior was changed since https://github.com/ggerganov/llama.cpp/pull/1995.
enum ggml_task_type {
GGML_TASK_TYPE_INIT = 0,
GGML_TASK_TYPE_COMPUTE,
GGML_TASK_TYPE_FINALIZE,
};
struct ggml_compute_params {
enum ggml_task_type type;
// ith = thread index, nth = number of threads
int ith, nth;
// work buffer for all threads
size_t wsize;
void * wdata;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,

1171
ggml/src/CMakeLists.txt Normal file

File diff suppressed because it is too large

View File

@ -1172,7 +1172,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
// check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1) {
for (int b = 0; b < src_backend_id; b++) {
if (ggml_backend_offload_op(sched->backends[b], tensor)) {
if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
SET_CAUSE(tensor, "1.off");
return b;
}

View File

@ -152,16 +152,16 @@ static ggml_cuda_device_info ggml_cuda_init() {
GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES);
int64_t total_vram = 0;
#if defined(GGML_CUDA_FORCE_MMQ)
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#ifdef GGML_CUDA_FORCE_MMQ
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__);
#else
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
#endif
#if defined(CUDA_USE_TENSOR_CORES)
GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: yes\n", __func__);
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_MMQ: no\n", __func__);
#endif // GGML_CUDA_FORCE_MMQ
#ifdef GGML_CUDA_FORCE_CUBLAS
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: yes\n", __func__);
#else
GGML_CUDA_LOG_INFO("%s: CUDA_USE_TENSOR_CORES: no\n", __func__);
#endif
GGML_CUDA_LOG_INFO("%s: GGML_CUDA_FORCE_CUBLAS: no\n", __func__);
#endif // GGML_CUDA_FORCE_CUBLAS
GGML_CUDA_LOG_INFO("%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count);
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
}
const int cc = ggml_cuda_info().devices[id].cc;
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
}
return row_rounding;
}
@ -1873,9 +1873,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
int64_t min_compute_capability = INT_MAX;
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
bool any_gpus_with_slow_fp16 = false;
bool any_pascal_with_slow_fp16 = false;
if (split) {
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
auto & tensor_split = buft_ctx->tensor_split;
@ -1885,55 +1893,18 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
continue;
}
if (min_compute_capability > ggml_cuda_info().devices[id].cc) {
min_compute_capability = ggml_cuda_info().devices[id].cc;
}
if (ggml_cuda_info().devices[id].cc == 610) {
any_pascal_with_slow_fp16 = true;
}
const int cc = ggml_cuda_info().devices[id].cc;
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
} else {
min_compute_capability = ggml_cuda_info().devices[ctx.device].cc;
any_pascal_with_slow_fp16 = ggml_cuda_info().devices[ctx.device].cc == 610;
const int cc = ggml_cuda_info().devices[ctx.device].cc;
use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
}
// check data types and tensor shapes for custom matrix multiplication kernels:
bool use_dequantize_mul_mat_vec = (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src0->ne[0] % GGML_CUDA_DMMV_X == 0 && src1->ne[1] == 1;
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
bool use_mul_mat_q = ggml_cuda_supports_mmq(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
#ifdef CUDA_USE_TENSOR_CORES
use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
#endif // CUDA_USE_TENSOR_CORES
#else
// fp16 performance is good on Volta or newer and on P100 (compute capability 6.0)
const bool fp16_performance_good = min_compute_capability >= CC_PASCAL && !any_pascal_with_slow_fp16;
// mmvq and mmq need the __dp4a instruction which on NVIDIA is only available for CC >= 6.1
use_mul_mat_vec_q = use_mul_mat_vec_q && min_compute_capability >= MIN_CC_DP4A;
use_mul_mat_q = use_mul_mat_q && min_compute_capability >= MIN_CC_DP4A;
#ifdef CUDA_USE_TENSOR_CORES
// when tensor cores are available, use them for large batch size
// ref: https://github.com/ggerganov/llama.cpp/pull/3776
use_mul_mat_q = use_mul_mat_q && (!fp16_performance_good || src1->ne[1] <= MMQ_MAX_BATCH_SIZE);
#endif // CUDA_USE_TENSOR_CORES
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
// if mmvq is available it's a better choice than dmmv:
#ifndef GGML_CUDA_FORCE_DMMV
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
@ -1947,14 +1918,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// KQ single-batch
if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
// FP32 precision KQ single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
} else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
} else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// FP32 precision KQV single-batch for batch size 1 without FlashAttention
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || fp16_performance_good) && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch without FlashAttention
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
} else if (use_dequantize_mul_mat_vec) {
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
@ -2267,6 +2239,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_SQR:
ggml_cuda_op_sqr(ctx, dst);
break;
case GGML_OP_SQRT:
ggml_cuda_op_sqrt(ctx, dst);
break;
case GGML_OP_CLAMP:
ggml_cuda_op_clamp(ctx, dst);
break;
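The new GGML_OP_SQRT case dispatches to ggml_cuda_op_sqrt. For orientation, here is a minimal standalone sketch of the kind of element-wise kernel such an op wraps; this is our own illustration, not the ggml implementation, and it skips ggml's tensor metadata and error handling.

#include <cstdio>
#include <cuda_runtime.h>

// Hypothetical element-wise sqrt kernel: one thread per element, guarded against overrun.
static __global__ void sqrt_f32(const float * x, float * dst, const int k) {
    const int i = blockDim.x*blockIdx.x + threadIdx.x;
    if (i >= k) {
        return;
    }
    dst[i] = sqrtf(x[i]);
}

int main() {
    const int k = 8;
    float h[8] = {0.f, 1.f, 4.f, 9.f, 16.f, 25.f, 36.f, 49.f};
    float *dx, *dy;
    cudaMalloc(&dx, k*sizeof(float));
    cudaMalloc(&dy, k*sizeof(float));
    cudaMemcpy(dx, h, k*sizeof(float), cudaMemcpyHostToDevice);
    sqrt_f32<<<(k + 255)/256, 256>>>(dx, dy, k);
    cudaMemcpy(h, dy, k*sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < k; ++i) {
        printf("%g ", h[i]); // expected: 0 1 2 3 4 5 6 7
    }
    printf("\n");
    cudaFree(dx);
    cudaFree(dy);
    return 0;
}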
@ -2830,6 +2805,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_RMS_NORM:
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_SQRT:
case GGML_OP_CLAMP:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:

View File

@ -146,23 +146,6 @@
#define CC_RDNA2 (CC_OFFSET_AMD + 1030)
#define CC_RDNA3 (CC_OFFSET_AMD + 1100)
// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
// - 7B quantum model: +100-200 MB
// - 13B quantum model: +200-400 MB
//
//#define GGML_CUDA_FORCE_MMQ
// TODO: improve this to be correct for more hardware
// for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
#if !defined(GGML_CUDA_FORCE_MMQ)
#define CUDA_USE_TENSOR_CORES
#endif
#define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels
#define MMQ_MAX_BATCH_SIZE 64 // max batch size to use MMQ kernels when tensor cores are available
#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses
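MATRIX_ROW_PADDING means, roughly, that the storage for quantized rows is rounded up to a multiple of 512 elements so kernels can read past the logical end of the last row without going out of bounds. A small hedged illustration of that round-up (a GGML_PAD-style helper, reimplemented here just for the example):

// Round-up-to-multiple helper of the kind used when padding quantized rows; values are illustrative.
constexpr int pad_to(const int n, const int mult) { return ((n + mult - 1) / mult) * mult; }
static_assert(pad_to(4096, 512) == 4096, "already aligned");
static_assert(pad_to(4097, 512) == 4608, "rounded up to the next multiple of 512");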
#if defined(_MSC_VER)
@ -343,15 +326,15 @@ static __device__ __forceinline__ half2 __shfl_xor(half2 var, int laneMask, int
#define INT8_MMA_AVAILABLE
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
static bool fast_fp16_available(const int cc) {
static constexpr bool fast_fp16_available(const int cc) {
return cc >= CC_PASCAL && cc != 610;
}
static bool fp16_mma_available(const int cc) {
static constexpr bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}
static bool int8_mma_available(const int cc) {
static constexpr bool int8_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_TURING;
}
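Making these helpers constexpr lets capability checks be evaluated at compile time wherever the compute capability is a constant. A self-contained sketch with assumed CC_* values (600/700/750 for Pascal/Volta/Turing and a large AMD offset, mirroring the defines earlier in this header) shows what the predicates report:

// Standalone re-statement of the capability helpers for illustration; the CC_* values
// are assumptions following the convention cc = major*100 + minor*10.
constexpr int CC_PASCAL     = 600;
constexpr int CC_VOLTA      = 700;
constexpr int CC_TURING     = 750;
constexpr int CC_OFFSET_AMD = 1000000;

constexpr bool fast_fp16_available(const int cc) { return cc >= CC_PASCAL && cc != 610; }
constexpr bool fp16_mma_available (const int cc) { return cc < CC_OFFSET_AMD && cc >= CC_VOLTA; }
constexpr bool int8_mma_available (const int cc) { return cc < CC_OFFSET_AMD && cc >= CC_TURING; }

static_assert( fast_fp16_available(600), "P100: fast fp16");
static_assert(!fast_fp16_available(610), "consumer Pascal (e.g. GTX 1080): slow fp16");
static_assert( fp16_mma_available(700) && !int8_mma_available(700), "Volta: fp16 MMA only");
static_assert( int8_mma_available(750), "Turing and newer: int8 MMA available");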
@ -643,19 +626,6 @@ struct ggml_cuda_type_traits<GGML_TYPE_IQ3_S> {
static constexpr int qi = QI3_S;
};
static int get_mmq_x_max_host(const int cc) {
#ifdef CUDA_USE_TENSOR_CORES
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? MMQ_MAX_BATCH_SIZE : 64;
#else
return cc >= CC_VOLTA && cc < CC_OFFSET_AMD ? 128 : 64;
#endif // CUDA_USE_TENSOR_CORES
}
// Round rows to this value for --split-mode row:
static int get_mmq_y_host(const int cc, const int mmq_x) {
return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
}
//////////////////////
struct ggml_cuda_device_info {

View File

@ -603,7 +603,7 @@ static void on_no_fattn_vec_case(const int D) {
if (D == 64) {
fprintf(stderr, "Unsupported KV type combination for head_size 64.\n");
fprintf(stderr, "By default only f16 KV cache is supported.\n");
fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for V cache quantization support.\n");
GGML_ASSERT(false);
} else if (D == 128) {
fprintf(stderr, "Unsupported KV type combination for head_size 128.\n");
@ -611,7 +611,7 @@ static void on_no_fattn_vec_case(const int D) {
fprintf(stderr, " - K == q4_0, V == q4_0, 4.50 BPV\n");
fprintf(stderr, " - K == q8_0, V == q8_0, 8.50 BPV\n");
fprintf(stderr, " - K == f16, V == f16, 16.00 BPV\n");
fprintf(stderr, "Compile with LLAMA_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
fprintf(stderr, "Compile with GGML_CUDA_FA_ALL_QUANTS for all combinations of q4_0, q4_1, q5_0, q5_1, q8_0, and f16.\n");
GGML_ASSERT(false);
} else {
fprintf(stderr, "Unsupported KV type combination for head_size 256.\n");

View File

@ -20,6 +20,20 @@ struct mma_int_A_I16K4 {
GGML_CUDA_ASSUME(ret < K);
return ret;
}
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE)
const int * xs = xs0 + (threadIdx.x%I)*stride;
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "+r"(x[0]), "+r"(x[1])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_i(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};
struct mma_int_A_I16K8 {
@ -42,6 +56,20 @@ struct mma_int_A_I16K8 {
GGML_CUDA_ASSUME(ret < K);
return ret;
}
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE)
const int * xs = xs0 + (threadIdx.x%I)*stride + (threadIdx.x/I)*(K/2);
asm("ldmatrix.sync.aligned.m8n8.x4.b16 {%0, %1, %2, %3}, [%4];"
: "+r"(x[0]), "+r"(x[1]), "+r"(x[2]), "+r"(x[3])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_i(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};
struct mma_int_B_J8K4 {
@ -64,6 +92,20 @@ struct mma_int_B_J8K4 {
GGML_CUDA_ASSUME(ret < K);
return ret;
}
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
const int * xs = xs0 + (threadIdx.x%J)*stride;
asm("ldmatrix.sync.aligned.m8n8.x1.b16 {%0}, [%1];"
: "+r"(x[0])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_j(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};
struct mma_int_B_J8K8 {
@ -86,6 +128,20 @@ struct mma_int_B_J8K8 {
GGML_CUDA_ASSUME(ret < K);
return ret;
}
__device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#if defined(INT8_MMA_AVAILABLE) && false // Loading as 4 byte values is faster
const int * xs = xs0 + (threadIdx.x%J)*stride + ((threadIdx.x/J)*(K/2)) % K;
asm("ldmatrix.sync.aligned.m8n8.x2.b16 {%0, %1}, [%2];"
: "+r"(x[0]), "+r"(x[1])
: "l"(xs));
#else
#pragma unroll
for (int l = 0; l < ne; ++l) {
x[l] = xs0[get_j(l)*stride + get_k(l)];
}
#endif // defined(INT8_MMA_AVAILABLE)
}
};
struct mma_int_C_I16J8 {

View File

@ -30,34 +30,34 @@ void ggml_cuda_op_mul_mat_q(
switch (src0->type) {
case GGML_TYPE_Q4_0:
mul_mat_q_case<GGML_TYPE_Q4_0>(args, stream);
mul_mat_q_case<GGML_TYPE_Q4_0>(ctx, args, stream);
break;
case GGML_TYPE_Q4_1:
mul_mat_q_case<GGML_TYPE_Q4_1>(args, stream);
mul_mat_q_case<GGML_TYPE_Q4_1>(ctx, args, stream);
break;
case GGML_TYPE_Q5_0:
mul_mat_q_case<GGML_TYPE_Q5_0>(args, stream);
mul_mat_q_case<GGML_TYPE_Q5_0>(ctx, args, stream);
break;
case GGML_TYPE_Q5_1:
mul_mat_q_case<GGML_TYPE_Q5_1>(args, stream);
mul_mat_q_case<GGML_TYPE_Q5_1>(ctx, args, stream);
break;
case GGML_TYPE_Q8_0:
mul_mat_q_case<GGML_TYPE_Q8_0>(args, stream);
mul_mat_q_case<GGML_TYPE_Q8_0>(ctx, args, stream);
break;
case GGML_TYPE_Q2_K:
mul_mat_q_case<GGML_TYPE_Q2_K>(args, stream);
mul_mat_q_case<GGML_TYPE_Q2_K>(ctx, args, stream);
break;
case GGML_TYPE_Q3_K:
mul_mat_q_case<GGML_TYPE_Q3_K>(args, stream);
mul_mat_q_case<GGML_TYPE_Q3_K>(ctx, args, stream);
break;
case GGML_TYPE_Q4_K:
mul_mat_q_case<GGML_TYPE_Q4_K>(args, stream);
mul_mat_q_case<GGML_TYPE_Q4_K>(ctx, args, stream);
break;
case GGML_TYPE_Q5_K:
mul_mat_q_case<GGML_TYPE_Q5_K>(args, stream);
mul_mat_q_case<GGML_TYPE_Q5_K>(ctx, args, stream);
break;
case GGML_TYPE_Q6_K:
mul_mat_q_case<GGML_TYPE_Q6_K>(args, stream);
mul_mat_q_case<GGML_TYPE_Q6_K>(ctx, args, stream);
break;
default:
GGML_ASSERT(false);
@ -69,7 +69,13 @@ void ggml_cuda_op_mul_mat_q(
GGML_UNUSED(src1_ddf_i);
}
bool ggml_cuda_supports_mmq(enum ggml_type type) {
bool ggml_cuda_should_use_mmq(enum ggml_type type, int cc, int64_t ne11) {
#ifdef GGML_CUDA_FORCE_CUBLAS
return false;
#endif // GGML_CUDA_FORCE_CUBLAS
bool mmq_supported;
switch (type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@ -81,8 +87,32 @@ bool ggml_cuda_supports_mmq(enum ggml_type type) {
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
return true;
mmq_supported = true;
break;
default:
return false;
mmq_supported = false;
break;
}
if (!mmq_supported) {
return false;
}
if (int8_mma_available(cc)) {
return true;
}
if (cc < MIN_CC_DP4A) {
return false;
}
#ifdef GGML_CUDA_FORCE_MMQ
return true;
#endif //GGML_CUDA_FORCE_MMQ
if (cc < CC_OFFSET_AMD) {
return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
return cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}
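To see how the new heuristic behaves, here is a standalone, hedged re-statement of the gating order above. The CC_* and batch-size constants are assumptions consistent with this diff (MMQ_DP4A_MAX_BATCH_SIZE in particular is defined elsewhere and its value is assumed), and the GGML_CUDA_FORCE_* overrides are ignored.

// Illustrative only: mirrors the order of checks in ggml_cuda_should_use_mmq for a
// quantized type that MMQ supports.
constexpr int MIN_CC_DP4A             = 610;     // assumed: __dp4a needs CC >= 6.1 on NVIDIA
constexpr int CC_VOLTA                = 700;
constexpr int CC_TURING               = 750;
constexpr int CC_OFFSET_AMD           = 1000000;
constexpr int CC_RDNA3                = CC_OFFSET_AMD + 1100;
constexpr int MMQ_DP4A_MAX_BATCH_SIZE = 64;      // assumed value

constexpr bool int8_mma_available(const int cc) { return cc < CC_OFFSET_AMD && cc >= CC_TURING; }

constexpr bool should_use_mmq(const int cc, const long ne11) {
    if (int8_mma_available(cc)) { return true;  }  // tensor-core MMQ always wins
    if (cc < MIN_CC_DP4A)       { return false; }  // no __dp4a -> fall back to cuBLAS/dmmv
    if (cc < CC_OFFSET_AMD)     { return cc < CC_VOLTA || ne11 < MMQ_DP4A_MAX_BATCH_SIZE; }
    return                               cc < CC_RDNA3 || ne11 < MMQ_DP4A_MAX_BATCH_SIZE;
}

static_assert( should_use_mmq(750, 512), "Turing+: int8 MMA path, any batch size");
static_assert( should_use_mmq(610,  16), "Pascal with DP4A: small batches use MMQ");
static_assert(!should_use_mmq(700, 512), "Volta without int8 MMA: large batches go to cuBLAS");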

ggml/src/ggml-cuda/mmq.cuh (new file, 2610 added lines)

File diff suppressed because it is too large.

View File

@ -1,5 +1,7 @@
#include "common.cuh"
#define MMVQ_MAX_BATCH_SIZE 8 // Max. batch size for which to use MMVQ kernels.
void ggml_cuda_op_mul_mat_vec_q(
ggml_backend_cuda_context & ctx,
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,

Some files were not shown because too many files have changed in this diff.