Compare commits

..

1 Commits

Author SHA1 Message Date
ceb77363cd ggml : disable CUDA graphs for non-llama.cpp projects 2024-06-26 20:14:22 +03:00
288 changed files with 184949 additions and 54489 deletions

View File

@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential git cmake libsdl2-dev apt-get install -y build-essential git cmake
WORKDIR /app WORKDIR /app

View File

@ -17,7 +17,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
ENV GGML_CUDA=1 ENV GGML_CUDA=1
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y build-essential libsdl2-dev \ apt-get install -y build-essential \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
# Ref: https://stackoverflow.com/a/53464012 # Ref: https://stackoverflow.com/a/53464012

View File

@ -12,7 +12,7 @@ FROM ubuntu:22.04 AS runtime
WORKDIR /app WORKDIR /app
RUN apt-get update && \ RUN apt-get update && \
apt-get install -y curl ffmpeg libsdl2-dev \ apt-get install -y curl ffmpeg \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/* && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
COPY --from=build /app /app COPY --from=build /app /app

View File

@ -13,10 +13,10 @@ jobs:
ubuntu-latest: ubuntu-latest:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/setup-go@v5 - uses: actions/setup-go@v3
with: with:
go-version: '^1.23' go-version: '^1.19'
- uses: actions/checkout@v4 - uses: actions/checkout@v1
- run: | - run: |
cd bindings/go cd bindings/go
make test make test

View File

@ -3,73 +3,20 @@ on:
push: push:
paths: paths:
- bindings/ruby/** - bindings/ruby/**
- src/whisper.cpp - whisper.h
- include/whisper.h
- ggml/src/ggml.c
- ggml/src/ggml-impl.h
- ggml/src/ggml-aarch64.h
- ggml/src/ggml-aarch64.c
- ggml/src/ggml-alloc.c
- ggml/src/ggml-backend-impl.h
- ggml/src/ggml-backend.cpp
- ggml/src/ggml-common.h
- ggml/src/ggml-quants.h
- ggml/src/ggml-quants.c
- ggml/src/ggml-cpu-impl.h
- ggml/src/ggml-metal.m
- ggml/src/ggml-metal.metal
- ggml/src/ggml-blas.cpp
- ggml/include/ggml.h
- ggml/include/ggml-alloc.h
- ggml/include/ggml-backend.h
- ggml/include/ggml-cuda.h
- ggml/include/ggml-kompute.h
- ggml/include/ggml-metal.h
- ggml/include/ggml-sycl.h
- ggml/include/ggml-vulkan.h
- ggml/include/ggml-blas.h
- scripts/get-flags.mk
- examples/dr_wav.h
pull_request: pull_request:
paths: paths:
- bindings/ruby/** - bindings/ruby/**
- src/whisper.cpp - whisper.h
- include/whisper.h
- ggml/src/ggml.c
- ggml/src/ggml-impl.h
- ggml/src/ggml-aarch64.h
- ggml/src/ggml-aarch64.c
- ggml/src/ggml-alloc.c
- ggml/src/ggml-backend-impl.h
- ggml/src/ggml-backend.cpp
- ggml/src/ggml-common.h
- ggml/src/ggml-quants.h
- ggml/src/ggml-quants.c
- ggml/src/ggml-cpu-impl.h
- ggml/src/ggml-metal.m
- ggml/src/ggml-metal.metal
- ggml/src/ggml-blas.cpp
- ggml/include/ggml.h
- ggml/include/ggml-alloc.h
- ggml/include/ggml-backend.h
- ggml/include/ggml-cuda.h
- ggml/include/ggml-kompute.h
- ggml/include/ggml-metal.h
- ggml/include/ggml-sycl.h
- ggml/include/ggml-vulkan.h
- ggml/include/ggml-blas.h
- scripts/get-flags.mk
- examples/dr_wav.h
jobs: jobs:
ubuntu-latest: ubuntu-latest:
runs-on: ubuntu-latest runs-on: ubuntu-latest
defaults:
run:
working-directory: bindings/ruby
steps: steps:
- uses: ruby/setup-ruby@v1 - uses: ruby/setup-ruby@v1
with: with:
ruby-version: '3.0' ruby-version: '3.0'
- uses: actions/checkout@v4 - uses: actions/checkout@v1
- run: rake test - run: |
cd bindings/ruby/ext
ruby extconf.rb && make

View File

@ -59,7 +59,7 @@ jobs:
uses: cross-platform-actions/action@v0.24.0 uses: cross-platform-actions/action@v0.24.0
with: with:
operating_system: freebsd operating_system: freebsd
version: '13.3' version: '13.2'
run: | run: |
sudo pkg update sudo pkg update
sudo pkg install -y gmake sdl2 sudo pkg install -y gmake sdl2
@ -586,75 +586,73 @@ jobs:
cd whisper/examples/whisper.android cd whisper/examples/whisper.android
./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML ./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
# TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602 android_java:
# android_java: runs-on: ubuntu-latest
# runs-on: ubuntu-latest
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: set up JDK 11
# uses: actions/setup-java@v4
# with:
# java-version: '11'
# distribution: 'temurin'
# cache: gradle
#
# - name: Setup Android SDK
# uses: android-actions/setup-android@v3
# with:
# cmdline-tools-version: 9.0
#
# - name: Build
# run: |
# cd examples/whisper.android.java
# chmod +x ./gradlew
# ./gradlew assembleRelease
# TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598 steps:
# java: - name: Clone
# needs: [ 'windows' ] uses: actions/checkout@v4
# runs-on: windows-latest
# steps: - name: set up JDK 11
# - uses: actions/checkout@v4 uses: actions/setup-java@v4
# with:
# - name: Install Java java-version: '11'
# uses: actions/setup-java@v4 distribution: 'temurin'
# with: cache: gradle
# distribution: zulu
# java-version: 20 - name: Setup Android SDK
# uses: android-actions/setup-android@v3
# - name: Download Windows lib with:
# uses: actions/download-artifact@v4 cmdline-tools-version: 9.0
# with:
# name: win32-x86-64_whisper.dll - name: Build
# path: bindings/java/build/generated/resources/main/win32-x86-64 run: |
# cd examples/whisper.android.java
# - name: Build chmod +x ./gradlew
# run: | ./gradlew assembleRelease
# models\download-ggml-model.cmd tiny.en
# cd bindings/java java:
# chmod +x ./gradlew needs: [ 'windows' ]
# ./gradlew build runs-on: windows-latest
# steps:
# - name: Upload jar - uses: actions/checkout@v4
# uses: actions/upload-artifact@v4
# with: - name: Install Java
# name: whispercpp.jar uses: actions/setup-java@v4
# path: bindings/java/build/libs/whispercpp-*.jar with:
# distribution: zulu
# - name: Publish package java-version: 20
# if: ${{ github.ref == 'refs/heads/master' }}
# uses: gradle/gradle-build-action@v2.4.2 - name: Download Windows lib
# with: uses: actions/download-artifact@v4
# arguments: publish with:
# build-root-directory: bindings/java name: win32-x86-64_whisper.dll
# env: path: bindings/java/build/generated/resources/main/win32-x86-64
# MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
# MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }} - name: Build
# PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }} run: |
# PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }} models\download-ggml-model.cmd tiny.en
cd bindings/java
chmod +x ./gradlew
./gradlew build
- name: Upload jar
uses: actions/upload-artifact@v4
with:
name: whispercpp.jar
path: bindings/java/build/libs/whispercpp-*.jar
- name: Publish package
if: ${{ github.ref == 'refs/heads/master' }}
uses: gradle/gradle-build-action@v2.4.2
with:
arguments: publish
build-root-directory: bindings/java
env:
MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
quantize: quantize:
runs-on: ubuntu-latest runs-on: ubuntu-latest

View File

@ -18,9 +18,7 @@ jobs:
matrix: matrix:
config: config:
- { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" } - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
#TODO: the cuda image keeps failing - disable for now - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
# https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
#- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
steps: steps:
- name: Check out the repo - name: Check out the repo

2
.gitignore vendored
View File

@ -3,13 +3,11 @@
.cache/ .cache/
.coreml/ .coreml/
.test/ .test/
.venv/
.vs/ .vs/
.vscode/ .vscode/
.DS_Store .DS_Store
.vimspector.json .vimspector.json
/CMakeSettings.json /CMakeSettings.json
/talk-llama.dSYM/
build/ build/
build-*/ build-*/

View File

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories. cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
project("whisper.cpp" C CXX) project("whisper.cpp" C CXX)
project("whisper.cpp" VERSION 1.7.1) project("whisper.cpp" VERSION 1.6.2)
include(CheckIncludeFileCXX) include(CheckIncludeFileCXX)
set(SOVERSION 1) set(SOVERSION 1)
@ -120,10 +120,7 @@ whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
# build the library # build the library
# #
if (NOT TARGET ggml) add_subdirectory(ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
add_subdirectory(src) add_subdirectory(src)
# #
@ -164,6 +161,18 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake ${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper) DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
install(
FILES convert-hf-to-gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
configure_file(cmake/whisper.pc.in configure_file(cmake/whisper.pc.in
"${CMAKE_CURRENT_BINARY_DIR}/whisper.pc" "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
@ONLY) @ONLY)

102
Makefile
View File

@ -3,11 +3,12 @@ BUILD_TARGETS = \
main \ main \
bench \ bench \
quantize \ quantize \
server server \
tests/test-c.o
# Binaries only useful for tests # Binaries only useful for tests
TEST_TARGETS = \ TEST_TARGETS = \
tests/test-c.o tests/test-backend-ops
# Deprecation aliases # Deprecation aliases
ifdef WHISPER_CUBLAS ifdef WHISPER_CUBLAS
@ -134,18 +135,14 @@ ifdef GGML_RPC
BUILD_TARGETS += rpc-server BUILD_TARGETS += rpc-server
endif endif
ifdef GGML_VULKAN
BUILD_TARGETS += vulkan-shaders-gen
endif
ifeq ($(shell sdl2-config --cflags --libs 2>/dev/null),) ifeq ($(shell sdl2-config --cflags --libs 2>/dev/null),)
else else
BUILD_TARGETS += \ BUILD_TARGETS += \
command \ command \
stream \ stream \
lsp \ lsp \
talk \
talk-llama talk-llama
# talk (TODO: disalbed)
endif endif
default: $(BUILD_TARGETS) default: $(BUILD_TARGETS)
@ -255,9 +252,6 @@ ifdef WHISPER_DEBUG
endif endif
else else
MK_CPPFLAGS += -DNDEBUG MK_CPPFLAGS += -DNDEBUG
MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
MK_NVCCFLAGS += -O3
endif endif
ifdef WHISPER_SANITIZE_THREAD ifdef WHISPER_SANITIZE_THREAD
@ -507,15 +501,16 @@ ifdef GGML_CUDA
CUDA_PATH ?= /usr/local/cuda CUDA_PATH ?= /usr/local/cuda
endif endif
#MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
#MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math MK_NVCCFLAGS += -use_fast_math
OBJ_GGML += ggml/src/ggml-cuda.o OBJ_GGML += ggml/src/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu)) OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL) OBJ_GGML += $(OBJ_CUDA_TMPL)
OBJ_WHISPER += src/whisper-mel-cuda.o
ifdef WHISPER_FATAL_WARNINGS ifdef WHISPER_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings MK_NVCCFLAGS += -Werror all-warnings
endif # WHISPER_FATAL_WARNINGS endif # WHISPER_FATAL_WARNINGS
@ -624,12 +619,16 @@ ggml/src/ggml-cuda.o: \
ggml/src/ggml-common.h \ ggml/src/ggml-common.h \
$(wildcard ggml/src/ggml-cuda/*.cuh) $(wildcard ggml/src/ggml-cuda/*.cuh)
$(NVCC_COMPILE) $(NVCC_COMPILE)
src/whisper-mel-cuda.o: src/whisper-mel-cuda.cu src/whisper-mel-cuda.hpp
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endif # GGML_CUDA endif # GGML_CUDA
ifdef GGML_VULKAN ifdef GGML_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += $(shell pkg-config --libs vulkan) MK_LDFLAGS += -lvulkan
OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o OBJ_GGML += ggml/src/ggml-vulkan.o
ifdef GGML_VULKAN_CHECK_RESULTS ifdef GGML_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
@ -643,10 +642,6 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
endif endif
ifdef GGML_VULKAN_PERF
MK_CPPFLAGS += -DGGML_VULKAN_PERF
endif
ifdef GGML_VULKAN_VALIDATE ifdef GGML_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif endif
@ -655,28 +650,10 @@ ifdef GGML_VULKAN_RUN_TESTS
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
endif endif
GLSLC_CMD = glslc ggml/src/ggml-vulkan.o: \
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen ggml/src/ggml-vulkan.cpp \
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp ggml/include/ggml-vulkan.h
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp $(CXX) $(CXXFLAGS) -c $< -o $@
_ggml_vk_input_dir = ggml/src/vulkan-shaders
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
$(_ggml_vk_header): $(_ggml_vk_source)
$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
$(_ggml_vk_genshaders_cmd) \
--glslc $(GLSLC_CMD) \
--input-dir $(_ggml_vk_input_dir) \
--target-hpp $(_ggml_vk_header) \
--target-cpp $(_ggml_vk_source)
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
endif # GGML_VULKAN endif # GGML_VULKAN
ifdef GGML_HIPBLAS ifdef GGML_HIPBLAS
@ -803,8 +780,7 @@ OBJ_GGML += \
ggml/src/ggml.o \ ggml/src/ggml.o \
ggml/src/ggml-alloc.o \ ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \ ggml/src/ggml-backend.o \
ggml/src/ggml-quants.o \ ggml/src/ggml-quants.o
ggml/src/ggml-aarch64.o
OBJ_WHISPER += \ OBJ_WHISPER += \
src/whisper.o src/whisper.o
@ -923,10 +899,10 @@ ggml/src/ggml-alloc.o: \
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-backend.o: \ ggml/src/ggml-backend.o: \
ggml/src/ggml-backend.cpp \ ggml/src/ggml-backend.c \
ggml/include/ggml.h \ ggml/include/ggml.h \
ggml/include/ggml-backend.h ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-quants.o: \ ggml/src/ggml-quants.o: \
ggml/src/ggml-quants.c \ ggml/src/ggml-quants.c \
@ -935,13 +911,6 @@ ggml/src/ggml-quants.o: \
ggml/src/ggml-common.h ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@ $(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-aarch64.o: \
ggml/src/ggml-aarch64.c \
ggml/include/ggml.h \
ggml/src/ggml-aarch64.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-blas.o: \ ggml/src/ggml-blas.o: \
ggml/src/ggml-blas.cpp \ ggml/src/ggml-blas.cpp \
ggml/include/ggml-blas.h ggml/include/ggml-blas.h
@ -974,6 +943,7 @@ $(LIB_GGML_S): \
src/whisper.o: \ src/whisper.o: \
src/whisper.cpp \ src/whisper.cpp \
src/whisper-mel.hpp \
include/whisper.h \ include/whisper.h \
ggml/include/ggml.h \ ggml/include/ggml.h \
ggml/include/ggml-alloc.h \ ggml/include/ggml-alloc.h \
@ -988,8 +958,7 @@ $(LIB_WHISPER): \
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_WHISPER_S): \ $(LIB_WHISPER_S): \
$(OBJ_WHISPER) \ $(OBJ_WHISPER)
$(OBJ_GGML)
ar rcs $(LIB_WHISPER_S) $^ ar rcs $(LIB_WHISPER_S) $^
# common # common
@ -1066,6 +1035,9 @@ main: examples/main/main.cpp \
$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./llama-cli -h for help. ===='
@echo
bench: examples/bench/bench.cpp \ bench: examples/bench/bench.cpp \
$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)
@ -1097,14 +1069,12 @@ lsp: examples/lsp/lsp.cpp \
$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
# TODO: disabled until update talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
# https://github.com/ggerganov/whisper.cpp/issues/1818 $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
#talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \ $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
# $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
# $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
# $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/llama-vocab.cpp examples/talk-llama/llama-grammar.cpp examples/talk-llama/llama-sampling.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \ talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL) $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
@ -1118,6 +1088,11 @@ tests: $(TEST_TARGETS)
tests/test-c.o: tests/test-c.c include/whisper.h tests/test-c.o: tests/test-c.c include/whisper.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@ $(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-backend-ops: tests/test-backend-ops.cpp \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
# #
# Audio samples # Audio samples
# #
@ -1163,9 +1138,8 @@ samples:
.PHONY: large-v1 .PHONY: large-v1
.PHONY: large-v2 .PHONY: large-v2
.PHONY: large-v3 .PHONY: large-v3
.PHONY: large-v3-turbo
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo: main tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
bash ./models/download-ggml-model.sh $@ bash ./models/download-ggml-model.sh $@
@echo "" @echo ""
@echo "===============================================" @echo "==============================================="

View File

@ -32,9 +32,8 @@ let package = Package(
sources: [ sources: [
"ggml/src/ggml.c", "ggml/src/ggml.c",
"src/whisper.cpp", "src/whisper.cpp",
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c", "ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp", "ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c", "ggml/src/ggml-quants.c",
"ggml/src/ggml-metal.m" "ggml/src/ggml-metal.m"
], ],

111
README.md
View File

@ -7,23 +7,21 @@
[![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp) [![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/) [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Stable: [v1.7.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126) Stable: [v1.6.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.6.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model: High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
- Plain C/C++ implementation without dependencies - Plain C/C++ implementation without dependencies
- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](#core-ml-support) - Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
- AVX intrinsics support for x86 architectures - AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures - VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision - Mixed F16 / F32 precision
- [4-bit and 5-bit integer quantization support](#quantization) - [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
- Zero memory allocations at runtime - Zero memory allocations at runtime
- [Vulkan support](#vulkan-gpu-support)
- Support for CPU-only inference - Support for CPU-only inference
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support) - [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [OpenVINO Support](#openvino-support) - [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
- [Ascend NPU Support](#ascend-npu-support) - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
Supported platforms: Supported platforms:
@ -35,9 +33,9 @@ Supported platforms:
- [x] [WebAssembly](examples/whisper.wasm) - [x] [WebAssembly](examples/whisper.wasm)
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)] - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166) - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp) - [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp). The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library. The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications. Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
@ -57,8 +55,8 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
## Implementation details ## Implementation details
- The core tensor operations are implemented in C ([ggml.h](ggml/include/ggml.h) / [ggml.c](ggml/src/ggml.c)) - The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](include/whisper.h) / [whisper.cpp](src/whisper.cpp)) - The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
- Sample usage is demonstrated in [main.cpp](examples/main) - Sample usage is demonstrated in [main.cpp](examples/main)
- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream) - Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
- Various other examples are available in the [examples](examples) folder - Various other examples are available in the [examples](examples) folder
@ -73,23 +71,17 @@ First clone the repository:
git clone https://github.com/ggerganov/whisper.cpp.git git clone https://github.com/ggerganov/whisper.cpp.git
``` ```
Navigate into the directory:
```
cd whisper.cpp
```
Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example: Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:
```bash ```bash
sh ./models/download-ggml-model.sh base.en bash ./models/download-ggml-model.sh base.en
``` ```
Now build the [main](examples/main) example and transcribe an audio file like this: Now build the [main](examples/main) example and transcribe an audio file like this:
```bash ```bash
# build the main example # build the main example
make -j make
# transcribe an audio file # transcribe an audio file
./main -f samples/jfk.wav ./main -f samples/jfk.wav
@ -100,7 +92,7 @@ make -j
For a quick demo, simply run `make base.en`: For a quick demo, simply run `make base.en`:
```text ```text
$ make -j base.en $ make base.en
cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o
@ -153,7 +145,7 @@ options:
-ng, --no-gpu [false ] disable GPU -ng, --no-gpu [false ] disable GPU
sh ./models/download-ggml-model.sh base.en bash ./models/download-ggml-model.sh base.en
Downloading ggml model base.en ... Downloading ggml model base.en ...
ggml-base.en.bin 100%[========================>] 141.11M 6.34MB/s in 24s ggml-base.en.bin 100%[========================>] 141.11M 6.34MB/s in 24s
Done! Model 'base.en' saved in 'models/ggml-base.en.bin' Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
@ -224,7 +216,7 @@ ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
If you want some extra audio samples to play with, simply run: If you want some extra audio samples to play with, simply run:
``` ```
make -j samples make samples
``` ```
This will download a few more audio files from Wikipedia and convert them to 16-bit WAV format via `ffmpeg`. This will download a few more audio files from Wikipedia and convert them to 16-bit WAV format via `ffmpeg`.
@ -232,18 +224,17 @@ This will download a few more audio files from Wikipedia and convert them to 16-
You can download and run the other models as follows: You can download and run the other models as follows:
``` ```
make -j tiny.en make tiny.en
make -j tiny make tiny
make -j base.en make base.en
make -j base make base
make -j small.en make small.en
make -j small make small
make -j medium.en make medium.en
make -j medium make medium
make -j large-v1 make large-v1
make -j large-v2 make large-v2
make -j large-v3 make large-v3
make -j large-v3-turbo
``` ```
## Memory usage ## Memory usage
@ -265,7 +256,7 @@ Here are the steps for creating and using a quantized model:
```bash ```bash
# quantize a model with Q5_0 method # quantize a model with Q5_0 method
make -j quantize make quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0 ./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
# run the examples as usual, specifying the quantized model file # run the examples as usual, specifying the quantized model file
@ -430,16 +421,6 @@ make clean
GGML_CUDA=1 make -j GGML_CUDA=1 make -j
``` ```
## Vulkan GPU support
Cross-vendor solution which allows you to accelerate workload on your GPU.
First, make sure your graphics card driver provides support for Vulkan API.
Now build `whisper.cpp` with Vulkan support:
```
make clean
make GGML_VULKAN=1 -j
```
## BLAS CPU support via OpenBLAS ## BLAS CPU support via OpenBLAS
Encoder processing can be accelerated on the CPU via OpenBLAS. Encoder processing can be accelerated on the CPU via OpenBLAS.
@ -467,39 +448,6 @@ cmake -DWHISPER_MKL=ON ..
WHISPER_MKL=1 make -j WHISPER_MKL=1 make -j
``` ```
## Ascend NPU support
Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores.
First, check if your Ascend NPU device is supported:
**Verified devices**
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
Then, make sure you have installed [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community) . The lasted version of CANN is recommanded.
Now build `whisper.cpp` with CANN support:
```
mkdir build
cd build
cmake .. -D GGML_CANN=on
make -j
```
Run the inference examples as usual, for example:
```
./build/bin/main -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
```
*Notes:*
- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
## Docker ## Docker
### Prerequisites ### Prerequisites
@ -636,7 +584,7 @@ The [stream](examples/stream) tool samples the audio every half a second and run
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10). More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
```bash ```bash
make stream -j make stream
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000 ./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
``` ```
@ -803,7 +751,7 @@ took to execute it. The results are summarized in the following Github issue:
[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89) [Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py). Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](bench.py).
You can run it with the following command, by default it will run against any standard model in the models folder. You can run it with the following command, by default it will run against any standard model in the models folder.
@ -850,7 +798,6 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython) - [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
- [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp) - [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11) - [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper) - [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity) - [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)

View File

@ -14,14 +14,9 @@ GGML_METAL_PATH_RESOURCES := $(abspath ../..)
BUILD_DIR := build BUILD_DIR := build
MODELS_DIR := models MODELS_DIR := models
EXAMPLES_DIR := $(wildcard examples/*) EXAMPLES_DIR := $(wildcard examples/*)
INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include) INCLUDE_PATH := $(abspath ../..)
LIBRARY_PATH := $(abspath ../..) LIBRARY_PATH := $(abspath ../..)
ifeq ($(GGML_CUDA),1)
LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
BUILD_FLAGS := -ldflags "-extldflags '-lcudart -lcuda -lcublas'"
endif
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
endif endif

View File

@ -62,12 +62,6 @@ This will compile a static `libwhisper.a` in a `build` folder, download a model
make examples make examples
``` ```
To build using cuda support add `GGML_CUDA=1`:
```bash
GGML_CUDA=1 make examples
```
The examples are placed in the `build` directory. Once built, you can download all the models with the following command: The examples are placed in the `build` directory. Once built, you can download all the models with the following command:
```bash ```bash

View File

@ -24,7 +24,7 @@ const (
var ( var (
// The models which will be downloaded, if no model is specified as an argument // The models which will be downloaded, if no model is specified as an argument
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3", "large-v3-turbo"} modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
) )
var ( var (

View File

@ -1,10 +1,10 @@
module github.com/ggerganov/whisper.cpp/bindings/go module github.com/ggerganov/whisper.cpp/bindings/go
go 1.23 go 1.19
require ( require (
github.com/go-audio/wav v1.1.0 github.com/go-audio/wav v1.1.0
github.com/stretchr/testify v1.9.0 github.com/stretchr/testify v1.8.1
) )
require ( require (

View File

@ -1,3 +1,4 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
@ -8,9 +9,15 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE= github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@ -119,28 +119,6 @@ func (p *Params) SetAudioCtx(n int) {
p.audio_ctx = C.int(n) p.audio_ctx = C.int(n)
} }
func (p *Params) SetMaxContext(n int) {
p.n_max_text_ctx = C.int(n)
}
func (p *Params) SetBeamSize(n int) {
p.beam_search.beam_size = C.int(n)
}
func (p *Params) SetEntropyThold(t float32) {
p.entropy_thold = C.float(t)
}
func (p *Params) SetTemperature(t float32) {
p.temperature = C.float(t)
}
// Sets the fallback temperature incrementation
// Pass -1.0 to disable this feature
func (p *Params) SetTemperatureFallback(t float32) {
p.temperature_inc = C.float(t)
}
// Set initial prompt // Set initial prompt
func (p *Params) SetInitialPrompt(prompt string) { func (p *Params) SetInitialPrompt(prompt string) {
p.initial_prompt = C.CString(prompt) p.initial_prompt = C.CString(prompt)
@ -171,10 +149,6 @@ func (p *Params) String() string {
str += fmt.Sprintf(" duration_ms=%d", p.duration_ms) str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx) str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt)) str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
str += fmt.Sprintf(" temperature=%f", p.temperature)
str += fmt.Sprintf(" temperature_inc=%f", p.temperature_inc)
str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
if p.translate { if p.translate {
str += " translate" str += " translate"
} }

View File

@ -125,32 +125,6 @@ func (context *context) SetAudioCtx(n uint) {
context.params.SetAudioCtx(int(n)) context.params.SetAudioCtx(int(n))
} }
// Set maximum number of text context tokens to store
func (context *context) SetMaxContext(n int) {
context.params.SetMaxContext(n)
}
// Set Beam Size
func (context *context) SetBeamSize(n int) {
context.params.SetBeamSize(n)
}
// Set Entropy threshold
func (context *context) SetEntropyThold(t float32) {
context.params.SetEntropyThold(t)
}
// Set Temperature
func (context *context) SetTemperature(t float32) {
context.params.SetTemperature(t)
}
// Set the fallback temperature incrementation
// Pass -1.0 to disable this feature
func (context *context) SetTemperatureFallback(t float32) {
context.params.SetTemperatureFallback(t)
}
// Set initial prompt // Set initial prompt
func (context *context) SetInitialPrompt(prompt string) { func (context *context) SetInitialPrompt(prompt string) {
context.params.SetInitialPrompt(prompt) context.params.SetInitialPrompt(prompt)

View File

@ -4,90 +4,52 @@ import (
"os" "os"
"testing" "testing"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper" // Packages
"github.com/go-audio/wav" whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
assert "github.com/stretchr/testify/assert" assert "github.com/stretchr/testify/assert"
) )
func TestSetLanguage(t *testing.T) { const (
assert := assert.New(t) ModelPath = "../../models/ggml-tiny.bin"
SamplePath = "../../samples/jfk.wav"
)
func Test_Whisper_000(t *testing.T) {
assert := assert.New(t)
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
t.Skip("Skipping test, model not found:", ModelPath)
}
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
t.Skip("Skipping test, sample not found:", SamplePath)
}
// Load model
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
assert.NoError(model.Close())
t.Log("languages=", model.Languages())
}
func Test_Whisper_001(t *testing.T) {
assert := assert.New(t)
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
t.Skip("Skipping test, model not found:", ModelPath)
}
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
t.Skip("Skipping test, sample not found:", SamplePath)
}
// Load model
model, err := whisper.New(ModelPath) model, err := whisper.New(ModelPath)
assert.NoError(err) assert.NoError(err)
assert.NotNil(model) assert.NotNil(model)
defer model.Close() defer model.Close()
context, err := model.NewContext() // Get context for decoding
ctx, err := model.NewContext()
assert.NoError(err) assert.NoError(err)
assert.NotNil(ctx)
// This returns an error since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
err = context.SetLanguage("en")
assert.Error(err)
}
func TestContextModelIsMultilingual(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
isMultilingual := context.IsMultilingual()
// This returns false since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
assert.False(isMultilingual)
}
func TestLanguage(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
// This always returns en since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
expectedLanguage := "en"
actualLanguage := context.Language()
assert.Equal(expectedLanguage, actualLanguage)
}
func TestProcess(t *testing.T) {
assert := assert.New(t)
fh, err := os.Open(SamplePath)
assert.NoError(err)
defer fh.Close()
// Decode the WAV file - load the full buffer
dec := wav.NewDecoder(fh)
buf, err := dec.FullPCMBuffer()
assert.NoError(err)
assert.Equal(uint16(1), dec.NumChans)
data := buf.AsFloat32Buffer().Data
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
err = context.Process(data, nil, nil)
assert.NoError(err)
} }

View File

@ -48,12 +48,7 @@ type Context interface {
SetTokenTimestamps(bool) // Set token timestamps flag SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit) SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
SetAudioCtx(uint) // Set audio encoder context SetAudioCtx(uint) // Set audio encoder context
SetMaxContext(n int) // Set maximum number of text context tokens to store
SetBeamSize(n int) // Set Beam Size
SetEntropyThold(t float32) // Set Entropy threshold
SetInitialPrompt(prompt string) // Set initial prompt SetInitialPrompt(prompt string) // Set initial prompt
SetTemperature(t float32) // Set temperature
SetTemperatureFallback(t float32) // Set temperature incrementation
// Process mono audio data and return any errors. // Process mono audio data and return any errors.
// If defined, newly generated segments are passed to the // If defined, newly generated segments are passed to the

View File

@ -1,91 +0,0 @@
package whisper_test
import (
"testing"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
assert "github.com/stretchr/testify/assert"
)
func TestNew(t *testing.T) {
assert := assert.New(t)
t.Run("valid model path", func(t *testing.T) {
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
})
t.Run("invalid model path", func(t *testing.T) {
invalidModelPath := "invalid-model-path.bin"
model, err := whisper.New(invalidModelPath)
assert.Error(err)
assert.Nil(model)
})
}
func TestClose(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
err = model.Close()
assert.NoError(err)
}
func TestNewContext(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
assert.NotNil(context)
}
func TestIsMultilingual(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
isMultilingual := model.IsMultilingual()
// This returns false since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
assert.False(isMultilingual)
}
func TestLanguages(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
expectedLanguages := []string{
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
"ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
"el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
"bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
"sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
"mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
"yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
"lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
"bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
}
actualLanguages := model.Languages()
assert.Equal(expectedLanguages, actualLanguages)
}

View File

@ -1,6 +0,0 @@
package whisper_test
const (
ModelPath = "../../models/ggml-small.en.bin"
SamplePath = "../../samples/jfk.wav"
)

View File

@ -9,7 +9,7 @@ import (
// CGO // CGO
/* /*
#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp #cgo LDFLAGS: -lwhisper -lm -lstdc++
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics #cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
#include <whisper.h> #include <whisper.h>
#include <stdlib.h> #include <stdlib.h>

View File

@ -1,6 +1,6 @@
{ {
"name": "whisper.cpp", "name": "whisper.cpp",
"version": "1.7.1", "version": "1.6.2",
"description": "Whisper speech recognition", "description": "Whisper speech recognition",
"main": "whisper.js", "main": "whisper.js",
"scripts": { "scripts": {

View File

@ -1,3 +0,0 @@
LICENSE
pkg/
lib/whisper.*

View File

@ -1,111 +0,0 @@
whispercpp
==========
![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
Ruby bindings for [whisper.cpp][], an interface of automatic speech recognition model.
Installation
------------
Install the gem and add to the application's Gemfile by executing:
$ bundle add whispercpp
If bundler is not being used to manage dependencies, install the gem by executing:
$ gem install whispercpp
Usage
-----
```ruby
require "whisper"
whisper = Whisper::Context.new("path/to/model.bin")
params = Whisper::Params.new
params.language = "en"
params.offset = 10_000
params.duration = 60_000
params.max_text_tokens = 300
params.translate = true
params.print_timestamps = false
params.prompt = "Initial prompt here."
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
puts whole_text
end
```
### Preparing model ###
Use script to download model file(s):
```bash
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
sh ./models/download-ggml-model.sh base.en
```
There are some types of models. See [models][] page for details.
### Preparing audio file ###
Currently, whisper.cpp accepts only 16-bit WAV files.
### API ###
Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
```ruby
def format_time(time_ms)
sec, decimal_part = time_ms.divmod(1000)
min, sec = sec.divmod(60)
hour, min = min.divmod(60)
"%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end
whisper.transcribe("path/to/audio.wav", params)
whisper.each_segment.with_index do |segment, index|
line = "[%{nth}: %{st} --> %{ed}] %{text}" % {
nth: index + 1,
st: format_time(segment.start_time),
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
puts line
end
```
You can also add hook to params called on new segment:
```ruby
def format_time(time_ms)
sec, decimal_part = time_ms.divmod(1000)
min, sec = sec.divmod(60)
hour, min = min.divmod(60)
"%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end
# Add hook before calling #transcribe
params.on_new_segment do |segment|
line = "[%{st} --> %{ed}] %{text}" % {
st: format_time(segment.start_time),
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
puts line
end
whisper.transcribe("path/to/audio.wav", params)
```
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models

View File

@ -1,59 +1,12 @@
require 'rake/clean' require 'rake/clean'
require "bundler/gem_tasks" require 'rubygems/package'
require "pathname"
require "yaml"
require "rake/testtask"
extsources = YAML.load_file("extsources.yaml") desc 'Build gem'
SOURCES = FileList[] task :package do
extsources.each do |src| spec_source = File.read File.join(File.dirname(__FILE__),'whispercpp.gemspec')
basename = src.pathmap("%f") spec = nil
dest = basename == "LICENSE" ? basename : basename.pathmap("ext/%f") # see: http://gist.github.com/16215
file src Thread.new { spec = eval("#{spec_source}") }.join
file dest => src do |t| spec.validate
cp t.source, t.name Gem::Package.build(spec)
end
SOURCES.include dest
end
CLEAN.include SOURCES
CLEAN.include FileList[
"ext/*.o",
"ext/*.metal",
"ext/whisper.{so,bundle,dll}",
"ext/depend"
]
task build: SOURCES + FileList[
"ext/extconf.rb",
"ext/ruby_whisper.h",
"ext/ruby_whisper.cpp",
"whispercpp.gemspec",
]
directory "pkg"
CLOBBER.include "pkg"
TEST_MODEL = "../../models/ggml-base.en.bin"
LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
LIB_FILE = File.join("lib", LIB_NAME)
directory "lib"
task LIB_FILE => SOURCES + ["lib"] do |t|
Dir.chdir "ext" do
sh "ruby extconf.rb"
sh "make"
end
mv "ext/#{LIB_NAME}", t.name
end
CLEAN.include LIB_FILE
Rake::TestTask.new do |t|
t.test_files = FileList["tests/test_*.rb"]
end
task test: [TEST_MODEL, LIB_FILE]
file TEST_MODEL do
Dir.chdir "../.." do
sh "./models/download-ggml-model.sh base.en"
end
end end

View File

@ -3,33 +3,7 @@ ggml.c
ggml.h ggml.h
ggml-alloc.c ggml-alloc.c
ggml-alloc.h ggml-alloc.h
ggml-aarch64.c whisper.bundle
ggml-aarch64.h
ggml-backend.cpp
ggml-backend-impl.h
ggml-backend.c
ggml-backend.h
ggml-common.h
ggml-cpu-impl.h
ggml-metal.m
ggml-metal.metal
ggml-metal-embed.metal
ggml-blas.cpp
ggml-cuda.h
ggml-impl.h
ggml-kompute.h
ggml-metal.h
ggml-opencl.h
ggml-quants.c
ggml-quants.h
ggml-sycl.h
ggml-vulkan.h
ggml-blas.h
get-flags.mk
whisper.cpp whisper.cpp
whisper.h whisper.h
dr_wav.h dr_wav.h
depend
whisper.bundle
whisper.so
whisper.dll

View File

@ -1,4 +1,20 @@
require 'mkmf' require 'mkmf'
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
# need to use c++ compiler flags # need to use c++ compiler flags
$CXXFLAGS << ' -std=c++11' $CXXFLAGS << ' -std=c++11'
@ -12,219 +28,4 @@ if enable_config('march-tune-native', false)
$CXXFLAGS << ' -march=native -mtune=native' $CXXFLAGS << ' -march=native -mtune=native'
end end
def with_disabling_unsupported_files create_makefile('whisper')
disabled_files = []
unless $GGML_METAL
disabled_files << 'ggml-metal.h' << 'ggml-metal.m'
end
unless $GGML_METAL_EMBED_LIBRARY
disabled_files << 'ggml-metal.metal'
end
unless $OBJ_ALL&.include? 'ggml-blas.o'
disabled_files << 'ggml-blas.h' << 'ggml-blas.cpp'
end
disabled_files.filter! {|file| File.exist? file}
disabled_files.each do |file|
File.rename file, "#{file}.disabled"
end
yield
disabled_files.each do |file|
File.rename "#{file}.disabled", file
end
end
if ENV['WHISPER_METAL']
$GGML_METAL ||= true
$DEPRECATE_WARNING ||= true
end
$UNAME_S = `uname -s`.chomp
$UNAME_P = `uname -p`.chomp
$UNAME_M = `uname -m`.chomp
if $UNAME_S == 'Darwin'
unless ENV['GGML_NO_METAL']
$GGML_METAL ||= true
end
$GGML_NO_OPENMP ||= true
end
if $GGML_METAL
$GGML_METAL_EMBED_LIBRARY = true
end
$MK_CPPFLAGS = ''
$MK_CFLAGS = '-std=c11 -fPIC'
$MK_CXXFLAGS = '-std=c++11 -fPIC'
$MK_NVCCFLAGS = '-std=c++11'
$MK_LDFLAGS = ''
$OBJ_GGML = ''
$OBJ_WHISPER = ''
$OBJ_COMMON = ''
$OBJ_SDL = ''
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
if $UNAME_S == 'Linux'
$MK_CPPFLAGS << ' -D_GNU_SOURCE'
end
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
end
if ENV['WHISPER_DEBUG']
$MK_CFLAGS << ' -O0 -g'
$MK_CXXFLAGS << ' -O0 -g'
$MK_LDFLAGS << ' -g'
$MK_NVCCFLAGS << ' -O0 -g'
else
$MK_CPPFLAGS << ' -DNDEBUG'
$MK_CFLAGS << ' -O3'
$MK_CXXFLAGS << ' -O3'
$MK_NVCCFLAGS << ' -O3'
end
$WARN_FLAGS =
' -Wall' <<
' -Wextra' <<
' -Wpedantic' <<
' -Wcast-qual' <<
' -Wno-unused-function'
$MK_CFLAGS <<
$WARN_FLAGS <<
' -Wshadow' <<
' -Wstrict-prototypes' <<
' -Wpointer-arith' <<
' -Wmissing-prototypes' <<
' -Werror=implicit-int' <<
' -Werror=implicit-function-declaration'
$MK_CXXFLAGS <<
$WARN_FLAGS <<
' -Wmissing-declarations' <<
' -Wmissing-noreturn'
unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
$MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
end
if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
$MK_CFLAGS << ' -pthread'
$MK_CXXFLAGS << ' -pthread'
end
unless $_WIN32
$DSO_EXT = '.so'
else
$DSO_EXT = '.dll'
end
unless ENV['RISCV']
if %w[x86_64 i686 amd64].include? $UNAME_M
$HOST_CXXFLAGS ||= ''
$MK_CFLAGS << ' -march=native -mtune=native'
$HOST_CXXFLAGS << ' -march=native -mtune=native'
end
if $UNAME_M.match? /aarch64.*/
$MK_CFLAGS << ' -mcpu=native'
$MK_CXXFLAGS << ' -mcpu=native'
end
else
$MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
$MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
end
unless ENV['GGML_NO_ACCELERATE']
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS'
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
$MK_LDFLAGS << ' -framework Accelerate'
$OBJ_GGML << ' ggml-blas.o'
end
end
if ENV['GGML_OPENBLAS']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
$OBJ_GGML << ' ggml-blas.o'
end
if ENV['GGML_OPENBLAS64']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
$OBJ_GGML << ' ggml-blas.o'
end
if $GGML_METAL
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
$OBJ_GGML << ' ggml-metal.o'
if ENV['GGML_METAL_NDEBUG']
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
end
if $GGML_METAL_EMBED_LIBRARY
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
$OBJ_GGML << ' ggml-metal-embed.o'
end
end
$OBJ_GGML <<
' ggml.o' <<
' ggml-alloc.o' <<
' ggml-backend.o' <<
' ggml-quants.o' <<
' ggml-aarch64.o'
$OBJ_WHISPER <<
' whisper.o'
$OBJ_ALL = "#{$OBJ_GGML} #{$OBJ_WHISPER} #{$OBJ_COMMON} #{$OBJ_SDL}"
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
if $GGML_METAL_EMBED_LIBRARY
File.write 'depend', "$(OBJS): $(OBJS) ggml-metal-embed.o\n"
end
with_disabling_unsupported_files do
create_makefile('whisper')
end
File.open 'Makefile', 'a' do |file|
file.puts 'include get-flags.mk'
if $GGML_METAL
if $GGML_METAL_EMBED_LIBRARY
# mkmf determines object files to compile dependent on existing *.{c,cpp,m} files
# but ggml-metal-embed.c doesn't exist on creating Makefile.
file.puts "objs := $(OBJS)"
file.puts "OBJS = $(objs) 'ggml-metal-embed.o'"
file.puts 'include metal-embed.mk'
end
end
end

View File

@ -0,0 +1,141 @@
#pragma once
// ggml-backend internal header
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
//
// Backend buffer
//
// buffer type
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_buffer_type_context_t context;
};
// buffer
typedef void * ggml_backend_buffer_context_t;
struct ggml_backend_buffer_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};
struct ggml_backend_buffer {
struct ggml_backend_buffer_i iface;
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_context_t context;
size_t size;
enum ggml_backend_buffer_usage usage;
};
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size);
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend
//
typedef void * ggml_backend_context_t;
struct ggml_backend_i {
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
void (*GGML_CALL free)(ggml_backend_t backend);
// buffer allocation
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph with a plan
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async)
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
void (*GGML_CALL event_record) (ggml_backend_event_t event);
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
};
struct ggml_backend {
ggml_guid_t guid;
struct ggml_backend_i iface;
ggml_backend_context_t context;
};
struct ggml_backend_event {
ggml_backend_t backend;
void * context;
};
//
// Backend registry
//
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,233 @@
#pragma once
#include "ggml.h"
#include "ggml-alloc.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
//
// Backend buffer
//
// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
//
// Backend
//
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void ggml_backend_free(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
//
// Backend registry
//
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
//
// Backend scheduler
//
// The backend scheduler allows for multiple backends to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
// - the location of the pre-allocated tensors (e.g. the weights)
/*
Example usage:
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// preferrably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
// initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size);
// manually assign nodes to a backend (optional, should not be needed in most cases)
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
// if there are graph inputs:
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
}
*/
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
// when ask == false, the scheduler is passing the node tensor to the user for observation
// if the user returns false, the scheduler will cancel the graph compute
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
//
// Utils
//
struct ggml_backend_graph_copy {
ggml_backend_buffer_t buffer;
struct ggml_context * ctx_allocated;
struct ggml_context * ctx_unallocated;
struct ggml_cgraph * graph;
};
// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,43 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,272 @@
#pragma once
#include "ggml.h"
// GGML internal header
#include <assert.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf
#ifdef __cplusplus
extern "C" {
#endif
// static_assert should be a #define, but if it's not,
// fall back to the _Static_assert C11 keyword.
// if C99 - static_assert is noop
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef __cplusplus
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
typedef __fp16 ggml_fp16_internal_t;
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
typedef uint16_t ggml_fp16_internal_t;
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // __ARM_NEON
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set ggml_hash_set_new(size_t size);
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
// return index, asserts if table is full
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,46 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_vk_device {
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
const char * name;
const char * vendor;
int subgroupSize;
uint64_t bufferAlignment;
uint64_t maxAlloc;
};
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);
//
// backend API
//
// forward declaration
typedef struct ggml_backend * ggml_backend_t;
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,66 @@
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
//
// How it works?
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stddef.h>
#include <stdbool.h>
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 64
struct ggml_tensor;
struct ggml_cgraph;
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
// user-code should use only these functions
//
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,36 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_API void ggml_cl_init(void);
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
// GGML_API void * ggml_cl_host_malloc(size_t size);
// GGML_API void ggml_cl_host_free(void * ptr);
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
// backend API
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,133 @@
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,49 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 48
#define GGML_SYCL_NAME "SYCL"
// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
// devide buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,29 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus
}
#endif

View File

@ -1,14 +0,0 @@
ggml-metal-embed.o: \
ggml-metal.metal \
ggml-common.h
@echo "Embedding Metal library"
@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@$(AS) $(TEMP_ASSEMBLY) -o $@
@rm -f ${TEMP_ASSEMBLY}

File diff suppressed because it is too large Load Diff

View File

@ -3,13 +3,6 @@
#include "whisper.h" #include "whisper.h"
typedef struct {
VALUE *context;
VALUE user_data;
VALUE callback;
VALUE callbacks;
} ruby_whisper_callback_container;
typedef struct { typedef struct {
struct whisper_context *context; struct whisper_context *context;
} ruby_whisper; } ruby_whisper;
@ -17,9 +10,6 @@ typedef struct {
typedef struct { typedef struct {
struct whisper_full_params params; struct whisper_full_params params;
bool diarize; bool diarize;
ruby_whisper_callback_container *new_segment_callback_container;
ruby_whisper_callback_container *progress_callback_container;
ruby_whisper_callback_container *abort_callback_container;
} ruby_whisper_params; } ruby_whisper_params;
#endif #endif

View File

@ -1,29 +0,0 @@
---
- ../../src/whisper.cpp
- ../../include/whisper.h
- ../../ggml/src/ggml.c
- ../../ggml/src/ggml-impl.h
- ../../ggml/src/ggml-aarch64.h
- ../../ggml/src/ggml-aarch64.c
- ../../ggml/src/ggml-alloc.c
- ../../ggml/src/ggml-backend-impl.h
- ../../ggml/src/ggml-backend.cpp
- ../../ggml/src/ggml-common.h
- ../../ggml/src/ggml-quants.h
- ../../ggml/src/ggml-quants.c
- ../../ggml/src/ggml-cpu-impl.h
- ../../ggml/src/ggml-metal.m
- ../../ggml/src/ggml-metal.metal
- ../../ggml/src/ggml-blas.cpp
- ../../ggml/include/ggml.h
- ../../ggml/include/ggml-alloc.h
- ../../ggml/include/ggml-backend.h
- ../../ggml/include/ggml-cuda.h
- ../../ggml/include/ggml-kompute.h
- ../../ggml/include/ggml-metal.h
- ../../ggml/include/ggml-sycl.h
- ../../ggml/include/ggml-vulkan.h
- ../../ggml/include/ggml-blas.h
- ../../scripts/get-flags.mk
- ../../examples/dr_wav.h
- ../../LICENSE

View File

@ -1,163 +0,0 @@
require "test/unit"
require "whisper"
class TestCallback < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
def setup
GC.start
@params = Whisper::Params.new
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
@audio = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
end
def test_new_segment_callback
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_kind_of Integer, n_new
assert n_new > 0
assert_same @whisper, context
n_segments = context.full_n_segments
n_new.times do |i|
i_segment = n_segments - 1 + i
start_time = context.full_get_segment_t0(i_segment) * 10
end_time = context.full_get_segment_t1(i_segment) * 10
text = context.full_get_segment_text(i_segment)
assert_kind_of Integer, start_time
assert start_time >= 0
assert_kind_of Integer, end_time
assert end_time > 0
assert_match /ask not what your country can do for you, ask what you can do for your country/, text if i_segment == 0
end
}
@whisper.transcribe(@audio, @params)
end
def test_new_segment_callback_closure
search_word = "what"
@params.new_segment_callback = ->(context, state, n_new, user_data) {
n_segments = context.full_n_segments
n_new.times do |i|
i_segment = n_segments - 1 + i
text = context.full_get_segment_text(i_segment)
if text.include?(search_word)
t0 = context.full_get_segment_t0(i_segment)
t1 = context.full_get_segment_t1(i_segment)
raise "search word '#{search_word}' found at between #{t0} and #{t1}"
end
end
}
assert_raise RuntimeError do
@whisper.transcribe(@audio, @params)
end
end
def test_new_segment_callback_user_data
udata = Object.new
@params.new_segment_callback_user_data = udata
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_same udata, user_data
}
@whisper.transcribe(@audio, @params)
end
def test_new_segment_callback_user_data_gc
@params.new_segment_callback_user_data = "My user data"
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_equal "My user data", user_data
}
GC.start
assert_same @whisper, @whisper.transcribe(@audio, @params)
end
def test_progress_callback
first = nil
last = nil
@params.progress_callback = ->(context, state, progress, user_data) {
assert_kind_of Integer, progress
assert 0 <= progress && progress <= 100
assert_same @whisper, context
first = progress if first.nil?
last = progress
}
@whisper.transcribe(@audio, @params)
assert_equal 0, first
assert_equal 100, last
end
def test_progress_callback_user_data
udata = Object.new
@params.progress_callback_user_data = udata
@params.progress_callback = ->(context, state, n_new, user_data) {
assert_same udata, user_data
}
@whisper.transcribe(@audio, @params)
end
def test_on_progress
first = nil
last = nil
@params.on_progress do |progress|
assert_kind_of Integer, progress
assert 0 <= progress && progress <= 100
first = progress if first.nil?
last = progress
end
@whisper.transcribe(@audio, @params)
assert_equal 0, first
assert_equal 100, last
end
def test_abort_callback
i = 0
@params.abort_callback = ->(user_data) {
assert_nil user_data
i += 1
return false
}
@whisper.transcribe(@audio, @params)
assert i > 0
end
def test_abort_callback_abort
i = 0
@params.abort_callback = ->(user_data) {
i += 1
return i == 3
}
@whisper.transcribe(@audio, @params)
assert_equal 3, i
end
def test_abort_callback_user_data
udata = Object.new
@params.abort_callback_user_data = udata
yielded = nil
@params.abort_callback = ->(user_data) {
yielded = user_data
}
@whisper.transcribe(@audio, @params)
assert_same udata, yielded
end
def test_abort_on
do_abort = false
aborted_from_callback = false
@params.on_new_segment do |segment|
do_abort = true if segment.text.match? /ask/
end
i = 0
@params.abort_on do
i += 1
do_abort
end
@whisper.transcribe(@audio, @params)
assert i > 0
end
end

View File

@ -1,31 +0,0 @@
require 'test/unit'
require 'tempfile'
require 'tmpdir'
require 'shellwords'
class TestPackage < Test::Unit::TestCase
def test_build
Tempfile.create do |file|
assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
assert file.size > 0
assert_path_exist file.to_path
end
end
sub_test_case "Building binary on installation" do
def setup
system "rake", "build", exception: true
end
def test_install
match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
filename = match_data[1]
version = match_data[2]
basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
Dir.mktmpdir do |dir|
system "gem", "install", "--install-dir", dir.shellescape, "pkg/#{filename.shellescape}", exception: true
assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", basename)
end
end
end
end

View File

@ -1,155 +0,0 @@
require 'test/unit'
require 'whisper'
class TestParams < Test::Unit::TestCase
def setup
@params = Whisper::Params.new
end
def test_language
@params.language = "en"
assert_equal @params.language, "en"
@params.language = "auto"
assert_equal @params.language, "auto"
end
def test_offset
@params.offset = 10_000
assert_equal @params.offset, 10_000
@params.offset = 0
assert_equal @params.offset, 0
end
def test_duration
@params.duration = 60_000
assert_equal @params.duration, 60_000
@params.duration = 0
assert_equal @params.duration, 0
end
def test_max_text_tokens
@params.max_text_tokens = 300
assert_equal @params.max_text_tokens, 300
@params.max_text_tokens = 0
assert_equal @params.max_text_tokens, 0
end
def test_translate
@params.translate = true
assert @params.translate
@params.translate = false
assert !@params.translate
end
def test_no_context
@params.no_context = true
assert @params.no_context
@params.no_context = false
assert !@params.no_context
end
def test_single_segment
@params.single_segment = true
assert @params.single_segment
@params.single_segment = false
assert !@params.single_segment
end
def test_print_special
@params.print_special = true
assert @params.print_special
@params.print_special = false
assert !@params.print_special
end
def test_print_progress
@params.print_progress = true
assert @params.print_progress
@params.print_progress = false
assert !@params.print_progress
end
def test_print_realtime
@params.print_realtime = true
assert @params.print_realtime
@params.print_realtime = false
assert !@params.print_realtime
end
def test_print_timestamps
@params.print_timestamps = true
assert @params.print_timestamps
@params.print_timestamps = false
assert !@params.print_timestamps
end
def test_suppress_blank
@params.suppress_blank = true
assert @params.suppress_blank
@params.suppress_blank = false
assert !@params.suppress_blank
end
def test_suppress_non_speech_tokens
@params.suppress_non_speech_tokens = true
assert @params.suppress_non_speech_tokens
@params.suppress_non_speech_tokens = false
assert !@params.suppress_non_speech_tokens
end
def test_token_timestamps
@params.token_timestamps = true
assert @params.token_timestamps
@params.token_timestamps = false
assert !@params.token_timestamps
end
def test_split_on_word
@params.split_on_word = true
assert @params.split_on_word
@params.split_on_word = false
assert !@params.split_on_word
end
def test_initial_prompt
assert_nil @params.initial_prompt
@params.initial_prompt = "You are a polite person."
assert_equal "You are a polite person.", @params.initial_prompt
end
def test_temperature
assert_equal 0.0, @params.temperature
@params.temperature = 0.5
assert_equal 0.5, @params.temperature
end
def test_max_initial_ts
assert_equal 1.0, @params.max_initial_ts
@params.max_initial_ts = 600.0
assert_equal 600.0, @params.max_initial_ts
end
def test_length_penalty
assert_equal -1.0, @params.length_penalty
@params.length_penalty = 0.5
assert_equal 0.5, @params.length_penalty
end
def test_temperature_inc
assert_in_delta 0.2, @params.temperature_inc
@params.temperature_inc = 0.5
assert_in_delta 0.5, @params.temperature_inc
end
def test_entropy_thold
assert_in_delta 2.4, @params.entropy_thold
@params.entropy_thold = 3.0
assert_in_delta 3.0, @params.entropy_thold
end
def test_logprob_thold
assert_in_delta -1.0, @params.logprob_thold
@params.logprob_thold = -0.5
assert_in_delta -0.5, @params.logprob_thold
end
end

View File

@ -1,87 +0,0 @@
require "test/unit"
require "whisper"
class TestSegment < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params)
end
end
def test_iteration
whisper.each_segment do |segment|
assert_instance_of Whisper::Segment, segment
end
end
def test_enumerator
enum = whisper.each_segment
assert_instance_of Enumerator, enum
enum.to_a.each_with_index do |segment, index|
assert_instance_of Whisper::Segment, segment
assert_kind_of Integer, index
end
end
def test_start_time
i = 0
whisper.each_segment do |segment|
assert_equal 0, segment.start_time if i == 0
i += 1
end
end
def test_end_time
i = 0
whisper.each_segment do |segment|
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
i += 1
end
end
def test_on_new_segment
params = Whisper::Params.new
seg = nil
index = 0
params.on_new_segment do |segment|
assert_instance_of Whisper::Segment, segment
if index == 0
seg = segment
assert_equal 0, segment.start_time
assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
end
index += 1
end
whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
assert_equal 0, seg.start_time
assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
end
def test_on_new_segment_twice
params = Whisper::Params.new
seg = nil
params.on_new_segment do |segment|
seg = segment
return
end
params.on_new_segment do |segment|
assert_same seg, segment
return
end
whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
end
private
def whisper
self.class.whisper
end
end

View File

@ -1,13 +1,122 @@
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
EXTDIR = File.join(TOPDIR, 'ext')
#$LIBDIR = File.join(TOPDIR, 'lib')
#$:.unshift(LIBDIR)
$:.unshift(EXTDIR)
require 'whisper' require 'whisper'
require 'test/unit' require 'test/unit'
class TestWhisper < Test::Unit::TestCase class TestWhisper < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
def setup def setup
@params = Whisper::Params.new @params = Whisper::Params.new
end end
def test_language
@params.language = "en"
assert_equal @params.language, "en"
@params.language = "auto"
assert_equal @params.language, "auto"
end
def test_offset
@params.offset = 10_000
assert_equal @params.offset, 10_000
@params.offset = 0
assert_equal @params.offset, 0
end
def test_duration
@params.duration = 60_000
assert_equal @params.duration, 60_000
@params.duration = 0
assert_equal @params.duration, 0
end
def test_max_text_tokens
@params.max_text_tokens = 300
assert_equal @params.max_text_tokens, 300
@params.max_text_tokens = 0
assert_equal @params.max_text_tokens, 0
end
def test_translate
@params.translate = true
assert @params.translate
@params.translate = false
assert !@params.translate
end
def test_no_context
@params.no_context = true
assert @params.no_context
@params.no_context = false
assert !@params.no_context
end
def test_single_segment
@params.single_segment = true
assert @params.single_segment
@params.single_segment = false
assert !@params.single_segment
end
def test_print_special
@params.print_special = true
assert @params.print_special
@params.print_special = false
assert !@params.print_special
end
def test_print_progress
@params.print_progress = true
assert @params.print_progress
@params.print_progress = false
assert !@params.print_progress
end
def test_print_realtime
@params.print_realtime = true
assert @params.print_realtime
@params.print_realtime = false
assert !@params.print_realtime
end
def test_print_timestamps
@params.print_timestamps = true
assert @params.print_timestamps
@params.print_timestamps = false
assert !@params.print_timestamps
end
def test_suppress_blank
@params.suppress_blank = true
assert @params.suppress_blank
@params.suppress_blank = false
assert !@params.suppress_blank
end
def test_suppress_non_speech_tokens
@params.suppress_non_speech_tokens = true
assert @params.suppress_non_speech_tokens
@params.suppress_non_speech_tokens = false
assert !@params.suppress_non_speech_tokens
end
def test_token_timestamps
@params.token_timestamps = true
assert @params.token_timestamps
@params.token_timestamps = false
assert !@params.token_timestamps
end
def test_split_on_word
@params.split_on_word = true
assert @params.split_on_word
@params.split_on_word = false
assert !@params.split_on_word
end
def test_whisper def test_whisper
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin')) @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new params = Whisper::Params.new
@ -19,81 +128,4 @@ class TestWhisper < Test::Unit::TestCase
} }
end end
sub_test_case "After transcription" do
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params)
end
end
def whisper
self.class.whisper
end
def test_full_n_segments
assert_equal 1, whisper.full_n_segments
end
def test_full_lang_id
assert_equal 0, whisper.full_lang_id
end
def test_full_get_segment_t0
assert_equal 0, whisper.full_get_segment_t0(0)
assert_raise IndexError do
whisper.full_get_segment_t0(whisper.full_n_segments)
end
assert_raise IndexError do
whisper.full_get_segment_t0(-1)
end
end
def test_full_get_segment_t1
t1 = whisper.full_get_segment_t1(0)
assert_kind_of Integer, t1
assert t1 > 0
assert_raise IndexError do
whisper.full_get_segment_t1(whisper.full_n_segments)
end
end
def test_full_get_segment_speaker_turn_next
assert_false whisper.full_get_segment_speaker_turn_next(0)
end
def test_full_get_segment_text
assert_match /ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0)
end
end
def test_lang_max_id
assert_kind_of Integer, Whisper.lang_max_id
end
def test_lang_id
assert_equal 0, Whisper.lang_id("en")
assert_raise ArgumentError do
Whisper.lang_id("non existing language")
end
end
def test_lang_str
assert_equal "en", Whisper.lang_str(0)
assert_raise IndexError do
Whisper.lang_str(Whisper.lang_max_id + 1)
end
end
def test_lang_str_full
assert_equal "english", Whisper.lang_str_full(0)
assert_raise IndexError do
Whisper.lang_str_full(Whisper.lang_max_id + 1)
end
end
end end

View File

@ -1,5 +1,3 @@
require "yaml"
Gem::Specification.new do |s| Gem::Specification.new do |s|
s.name = "whispercpp" s.name = "whispercpp"
s.authors = ["Georgi Gerganov", "Todd A. Fisher"] s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
@ -9,16 +7,10 @@ Gem::Specification.new do |s|
s.email = 'todd.fisher@gmail.com' s.email = 'todd.fisher@gmail.com'
s.extra_rdoc_files = ['LICENSE', 'README.md'] s.extra_rdoc_files = ['LICENSE', 'README.md']
s.files = `git ls-files . -z`.split("\x0") + s.files = ["LICENSE", "README.md", "Rakefile", "ext/extconf.rb", "ext/ggml.c", "ext/ruby_whisper.cpp", "ext/whisper.cpp", "ext/dr_wav.h", "ext/ggml.h", "ext/ruby_whisper.h", "ext/whisper.h"]
YAML.load_file("extsources.yaml").collect {|file|
basename = File.basename(file)
if s.extra_rdoc_files.include?(basename)
basename
else
File.join("ext", basename)
end
}
#### Load-time details
s.require_paths = ['lib','ext']
s.summary = %q{Ruby whisper.cpp bindings} s.summary = %q{Ruby whisper.cpp bindings}
s.test_files = ["tests/test_whisper.rb"] s.test_files = ["tests/test_whisper.rb"]

View File

@ -1,7 +1,7 @@
set(WHISPER_VERSION @WHISPER_INSTALL_VERSION@) set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@) set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@) set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(WHISPER_SHARED_LIB @BUILD_SHARED_LIBS@) set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_BLAS @GGML_BLAS@) set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@) set(GGML_CUDA @GGML_CUDA@)
@ -11,9 +11,9 @@ set(GGML_ACCELERATE @GGML_ACCELERATE@)
@PACKAGE_INIT@ @PACKAGE_INIT@
set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@") set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(WHISPER_LIB_DIR "@PACKAGE_WHISPER_LIB_INSTALL_DIR@") set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(WHISPER_BIN_DIR "@PACKAGE_WHISPER_BIN_INSTALL_DIR@") set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied # Ensure transient dependencies satisfied
@ -43,23 +43,23 @@ if (GGML_HIPBLAS)
find_package(rocblas REQUIRED) find_package(rocblas REQUIRED)
endif() endif()
find_library(whisper_LIBRARY whisper find_library(llama_LIBRARY llama
REQUIRED REQUIRED
HINTS ${WHISPER_LIB_DIR}) HINTS ${LLAMA_LIB_DIR})
set(_whisper_link_deps "Threads::Threads" "@WHISPER_EXTRA_LIBS@") set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_whisper_transient_defines "@WHISPER_TRANSIENT_DEFINES@") set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
add_library(whisper UNKNOWN IMPORTED) add_library(llama UNKNOWN IMPORTED)
set_target_properties(whisper set_target_properties(llama
PROPERTIES PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}" INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_whisper_link_deps}" INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_whisper_transient_defines}" INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX" IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${whisper_LIBRARY}" IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11 INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON ) POSITION_INDEPENDENT_CODE ON )
check_required_components(whisper) check_required_components(Llama)

View File

@ -1,6 +1,6 @@
prefix=@CMAKE_INSTALL_PREFIX@ prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix} exec_prefix=${prefix}
libdir=@CMAKE_INSTALL_FULL_LIBDIR@ libdir=${exec_prefix}/lib
includedir=${prefix}/include includedir=${prefix}/include
Name: whisper Name: whisper

View File

@ -102,8 +102,8 @@ if (EMSCRIPTEN)
set_target_properties(libstream PROPERTIES FOLDER "libs") set_target_properties(libstream PROPERTIES FOLDER "libs")
add_subdirectory(command.wasm) add_subdirectory(command.wasm)
set_target_properties(libcommand PROPERTIES FOLDER "libs") set_target_properties(libcommand PROPERTIES FOLDER "libs")
#add_subdirectory(talk.wasm) add_subdirectory(talk.wasm)
#set_target_properties(libtalk PROPERTIES FOLDER "libs") set_target_properties(libtalk PROPERTIES FOLDER "libs")
add_subdirectory(bench.wasm) add_subdirectory(bench.wasm)
set_target_properties(libbench PROPERTIES FOLDER "libs") set_target_properties(libbench PROPERTIES FOLDER "libs")
elseif(CMAKE_JS_VERSION) elseif(CMAKE_JS_VERSION)
@ -127,10 +127,8 @@ endif (WHISPER_SDL2)
add_subdirectory(quantize) add_subdirectory(quantize)
set_target_properties(quantize PROPERTIES FOLDER "examples") set_target_properties(quantize PROPERTIES FOLDER "examples")
if (WHISPER_SDL2) if (WHISPER_SDL2)
# TODO: disabled until update add_subdirectory(talk)
# https://github.com/ggerganov/whisper.cpp/issues/1818 set_target_properties(talk PROPERTIES FOLDER "examples")
#add_subdirectory(talk)
#set_target_properties(talk PROPERTIES FOLDER "examples")
add_subdirectory(talk-llama) add_subdirectory(talk-llama)
set_target_properties(talk-llama PROPERTIES FOLDER "examples") set_target_properties(talk-llama PROPERTIES FOLDER "examples")
add_subdirectory(lsp) add_subdirectory(lsp)

View File

@ -72,9 +72,6 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_IQ4_XS: case GGML_FTYPE_MOSTLY_IQ4_XS:
case GGML_FTYPE_MOSTLY_IQ1_M: case GGML_FTYPE_MOSTLY_IQ1_M:
case GGML_FTYPE_MOSTLY_BF16: case GGML_FTYPE_MOSTLY_BF16:
case GGML_FTYPE_MOSTLY_Q4_0_4_4:
case GGML_FTYPE_MOSTLY_Q4_0_4_8:
case GGML_FTYPE_MOSTLY_Q4_0_8_8:
{ {
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false; return false;
@ -212,11 +209,6 @@ bool ggml_common_quantize_0(
case GGML_TYPE_IQ4_XS: case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ1_M: case GGML_TYPE_IQ1_M:
case GGML_TYPE_BF16: case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_COUNT: case GGML_TYPE_COUNT:
{ {
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype)); fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));

View File

@ -147,6 +147,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
case 7: return "He"; case 7: return "He";
case 8: return "She"; case 8: return "She";
case 9: return "They"; case 9: return "They";
default: return "To";
} }
return "The"; return "The";

View File

@ -9,7 +9,6 @@
#include <thread> #include <thread>
#include <ctime> #include <ctime>
#include <fstream> #include <fstream>
#include <sstream>
#define COMMON_SAMPLE_RATE 16000 #define COMMON_SAMPLE_RATE 16000
@ -287,43 +286,12 @@ void sam_print_usage(int argc, char ** argv, const sam_params & params);
// Terminal utils // Terminal utils
// //
#define SQR(X) ((X) * (X))
#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
/** // Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
* Quantizes 24-bit RGB to xterm256 code range [16,256). // Lowest is red, middle is yellow, highest is green.
*/
static int rgb2xterm256(int r, int g, int b) {
unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
int av, ir, ig, ib, il, qr, qg, qb, ql;
av = r * .299 + g * .587 + b * .114 + .5;
ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
qr = cube[(ir = UNCUBE(r))];
qg = cube[(ig = UNCUBE(g))];
qb = cube[(ib = UNCUBE(b))];
if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
return ir * 36 + ig * 6 + ib + 020;
return il + 0350;
}
static std::string set_xterm256_foreground(int r, int g, int b) {
int x = rgb2xterm256(r, g, b);
std::ostringstream oss;
oss << "\033[38;5;" << x << "m";
return oss.str();
}
// Lowest is red, middle is yellow, highest is green. Color scheme from
// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
const std::vector<std::string> k_colors = { const std::vector<std::string> k_colors = {
set_xterm256_foreground(220, 5, 12), "\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
set_xterm256_foreground(232, 96, 28), "\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
set_xterm256_foreground(241, 147, 45),
set_xterm256_foreground(246, 193, 65),
set_xterm256_foreground(247, 240, 86),
set_xterm256_foreground(144, 201, 135),
set_xterm256_foreground( 78, 178, 101),
}; };
// //

File diff suppressed because it is too large Load Diff

View File

@ -321,7 +321,7 @@ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_da
LOG("Couldn't map input file %s\n", ifname.c_str()); LOG("Couldn't map input file %s\n", ifname.c_str());
return err; return err;
} }
LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size); LOG("Mapped input file: %x size: %d\n", ibuf, ibuf_size);
struct audio_buffer inaudio_buf; struct audio_buffer inaudio_buf;
inaudio_buf.ptr = ibuf; inaudio_buf.ptr = ibuf;
inaudio_buf.size = ibuf_size; inaudio_buf.size = ibuf_size;

View File

@ -48,7 +48,7 @@ if [ -n "$3" ]; then
fi fi
# Whisper models # Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" "large-v3-turbo" ) models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
# list available models # list available models
function list_models { function list_models {

View File

@ -997,7 +997,6 @@ int main(int argc, char ** argv) {
if (params.dtw == "large.v1") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1; if (params.dtw == "large.v1") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
if (params.dtw == "large.v2") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2; if (params.dtw == "large.v2") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
if (params.dtw == "large.v3") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3; if (params.dtw == "large.v3") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
if (params.dtw == "large.v3.turbo") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3_TURBO;
if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) { if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str()); fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());

View File

@ -21,7 +21,7 @@ def process_audio(wav_file, model_name="base.en"):
if not os.path.exists(wav_file): if not os.path.exists(wav_file):
raise FileNotFoundError(f"WAV file not found: {wav_file}") raise FileNotFoundError(f"WAV file not found: {wav_file}")
full_command = f"./main -m {model} -f {wav_file} -nt" full_command = f"./main -m {model} -f {wav_file} -np -nt"
# Execute the command # Execute the command
process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

View File

@ -34,7 +34,6 @@ struct server_params
std::string hostname = "127.0.0.1"; std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public"; std::string public_path = "examples/server/public";
std::string request_path = ""; std::string request_path = "";
std::string inference_path = "/inference";
int32_t port = 8080; int32_t port = 8080;
int32_t read_timeout = 600; int32_t read_timeout = 600;
@ -133,7 +132,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port); fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str()); fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str()); fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str());
fprintf(stderr, " --inference-path PATH, [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false"); fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -184,7 +182,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if ( arg == "--host") { sparams.hostname = argv[++i]; } else if ( arg == "--host") { sparams.hostname = argv[++i]; }
else if ( arg == "--public") { sparams.public_path = argv[++i]; } else if ( arg == "--public") { sparams.public_path = argv[++i]; }
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; } else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; } else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -219,7 +216,7 @@ void check_ffmpeg_availibility() {
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) { bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
std::ostringstream cmd_stream; std::ostringstream cmd_stream;
std::string converted_filename_temp = temp_filename + "_temp.wav"; std::string converted_filename_temp = temp_filename + "_temp.wav";
cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1"; cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
std::string cmd = cmd_stream.str(); std::string cmd = cmd_stream.str();
int status = std::system(cmd.c_str()); int status = std::system(cmd.c_str());
@ -647,10 +644,10 @@ int main(int argc, char ** argv) {
return false; return false;
}); });
svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){ svr.Options(sparams.request_path + "/inference", [&](const Request &, Response &){
}); });
svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){ svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
// acquire whisper model mutex lock // acquire whisper model mutex lock
std::lock_guard<std::mutex> lock(whisper_mutex); std::lock_guard<std::mutex> lock(whisper_mutex);
@ -677,8 +674,7 @@ int main(int argc, char ** argv) {
if (sparams.ffmpeg_converter) { if (sparams.ffmpeg_converter) {
// if file is not wav, convert to wav // if file is not wav, convert to wav
// write to temporary file // write to temporary file
const std::string temp_filename_base = std::tmpnam(nullptr); const std::string temp_filename = "whisper_server_temp_file.wav";
const std::string temp_filename = temp_filename_base + ".wav";
std::ofstream temp_file{temp_filename, std::ios::binary}; std::ofstream temp_file{temp_filename, std::ios::binary};
temp_file << audio_file.content; temp_file << audio_file.content;
temp_file.close(); temp_file.close();

View File

@ -1,13 +1,7 @@
if (WHISPER_SDL2) if (WHISPER_SDL2)
# talk-llama # talk-llama
set(TARGET talk-llama) set(TARGET talk-llama)
add_executable(${TARGET} talk-llama.cpp add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
llama.cpp
llama-vocab.cpp
llama-grammar.cpp
llama-sampling.cpp
unicode.cpp
unicode-data.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS}) target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
if (WHISPER_CLBLAST) if (WHISPER_CLBLAST)

File diff suppressed because it is too large Load Diff

View File

@ -1,144 +0,0 @@
#pragma once
#include "llama-impl.h"
#include <map>
struct llama_vocab;
// grammar element type
enum llama_gretype {
// end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
// any character (.)
LLAMA_GRETYPE_CHAR_ANY = 7,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
using llama_grammar_rule = std::vector< llama_grammar_element>;
using llama_grammar_stack = std::vector<const llama_grammar_element *>;
using llama_grammar_rules = std::vector<llama_grammar_rule>;
using llama_grammar_stacks = std::vector<llama_grammar_stack>;
using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
// takes a set of possible pushdown stacks on a grammar, which are required to
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
void llama_grammar_accept(
const llama_grammar_rules & rules,
const llama_grammar_stacks & stacks,
uint32_t chr,
llama_grammar_stacks & stacks_new);
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const llama_grammar_rules & rules,
const llama_grammar_stack & stack,
const llama_grammar_candidates & candidates);
struct llama_grammar_parser {
std::map<std::string, uint32_t> symbol_ids;
llama_grammar_rules rules;
llama_grammar_stack c_rules() const;
uint32_t get_symbol_id(const char * src, size_t len);
uint32_t generate_symbol_id(const std::string & base_name);
void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
const char * parse_alternates(
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
const char * src,
const std::string & rule_name,
llama_grammar_rule & rule,
bool is_nested);
const char * parse_rule(const char * src);
bool parse(const char * src);
void print(FILE * file);
};
struct llama_grammar {
// note: allow null vocab for testing (not great)
const llama_vocab * vocab;
const llama_grammar_rules rules; // TODO: shared ptr
llama_grammar_stacks stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
//
// internal API
//
// note: needed for tests (not great)
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
void llama_grammar_free_impl(struct llama_grammar * grammar);
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
// TODO: move the API below as member functions of llama_grammar
void llama_grammar_apply_impl(
const struct llama_grammar & grammar,
llama_token_data_array * cur_p);
void llama_grammar_accept_impl(
struct llama_grammar & grammar,
llama_token token);

View File

@ -1,181 +0,0 @@
#pragma once
#include "llama.h"
#include <string>
#include <vector>
#include <stdexcept>
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
//
// logging
//
LLAMA_ATTRIBUTE_FORMAT(2, 3)
void llama_log_internal (ggml_log_level level, const char * format, ...);
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
//
// helpers
//
struct time_meas {
time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
~time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}
const int64_t t_start_us;
int64_t & t_acc;
};
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
// the ring buffer works similarly to std::deque, but with a fixed capacity
template<typename T>
struct ring_buffer {
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
T & front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
const T & front() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
T & back() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
const T & back() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
void push_back(const T & value) {
if (capacity == 0) {
throw std::runtime_error("ring buffer: capacity is zero");
}
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;
} else {
sz++;
}
data[pos] = value;
pos = (pos + 1) % capacity;
}
T pop_front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
T value = data[first];
first = (first + 1) % capacity;
sz--;
return value;
}
//T & operator[](size_t i) {
// if (i >= sz) {
// throw std::runtime_error("ring buffer: index out of bounds");
// }
// return data[(first + i) % capacity];
//}
//const T & at(size_t i) const {
// if (i >= sz) {
// throw std::runtime_error("ring buffer: index out of bounds");
// }
// return data[(first + i) % capacity];
//}
const T & rat(size_t i) const {
if (i >= sz) {
throw std::runtime_error("ring buffer: index out of bounds");
}
return data[(first + sz - i - 1) % capacity];
}
std::vector<T> to_vector() const {
std::vector<T> result;
result.reserve(sz);
for (size_t i = 0; i < sz; i++) {
result.push_back(data[(first + i) % capacity]);
}
return result;
}
void clear() {
// here only reset the status of the buffer
sz = 0;
first = 0;
pos = 0;
}
bool empty() const {
return sz == 0;
}
size_t size() const {
return sz;
}
size_t capacity = 0;
size_t sz = 0;
size_t first = 0;
size_t pos = 0;
std::vector<T> data;
};

File diff suppressed because it is too large Load Diff

View File

@ -1,29 +0,0 @@
#pragma once
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
#include "llama-grammar.h"
#include <unordered_map>
struct llama_vocab;
struct llama_grammar;
// sampler chain
struct llama_sampler_chain {
llama_sampler_chain_params params;
std::vector<struct llama_sampler *> samplers;
// timing
mutable int64_t t_sample_us;
mutable int32_t n_sample;
};
struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,
const char * grammar_root);

File diff suppressed because it is too large Load Diff

View File

@ -1,146 +0,0 @@
#pragma once
#include "llama-impl.h"
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
struct llm_tokenizer;
struct llama_vocab {
using id = llama_token;
using token = std::string;
using tattr = llama_token_attr;
struct token_data {
token text;
float score;
tattr attr;
};
uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_unk_id = 0;
id special_sep_id = -1;
id special_pad_id = -1;
id special_cls_id = -1;
id special_mask_id = -1;
id linefeed_id = 13;
id special_prefix_id = -1;
id special_suffix_id = -1;
id special_middle_id = -1;
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
id special_eom_id = -1;
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
// tokenizer flags
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
std::vector<char> precompiled_charsmap;
llm_tokenizer * tokenizer = nullptr;
llama_vocab() = default;
~llama_vocab();
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
void init_tokenizer();
};
//
// internal API
//
// TODO: rename to llama_tokenize_impl
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special = false);
// TODO: move the API below as member functions of llama_vocab
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special);
// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(
const struct llama_vocab & vocab,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);

File diff suppressed because it is too large Load Diff

View File

@ -33,18 +33,17 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF #define LLAMA_DEFAULT_SEED 0xFFFFFFFF
// TODO: use everywhere in the implementation #define LLAMA_MAX_RNG_STATE (64*1024)
#define LLAMA_TOKEN_NULL -1
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn' #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq' #define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 9 #define LLAMA_SESSION_VERSION 6
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ #define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
#define LLAMA_STATE_SEQ_VERSION 2 #define LLAMA_STATE_SEQ_VERSION 1
#ifdef __cplusplus #ifdef __cplusplus
extern "C" { extern "C" {
@ -56,10 +55,8 @@ extern "C" {
// TODO: show sample usage // TODO: show sample usage
// //
// struct llama_vocab; // TODO: add in the future
struct llama_model; struct llama_model;
struct llama_context; struct llama_context;
struct llama_sampler;
typedef int32_t llama_pos; typedef int32_t llama_pos;
typedef int32_t llama_token; typedef int32_t llama_token;
@ -70,8 +67,6 @@ extern "C" {
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
}; };
// pre-tokenization types // pre-tokenization types
@ -92,23 +87,15 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_DBRX = 13, LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14, LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15, LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
}; };
// note: these values should be synchronized with ggml_rope
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
enum llama_rope_type { enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1, LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0, LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX, LLAMA_ROPE_TYPE_NEOX = 2,
LLAMA_ROPE_TYPE_GLM = 4,
}; };
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@ -141,7 +128,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
// LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
@ -170,11 +157,6 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
}; };
@ -193,13 +175,6 @@ extern "C" {
LLAMA_POOLING_TYPE_MEAN = 1, LLAMA_POOLING_TYPE_MEAN = 1,
LLAMA_POOLING_TYPE_CLS = 2, LLAMA_POOLING_TYPE_CLS = 2,
LLAMA_POOLING_TYPE_LAST = 3, LLAMA_POOLING_TYPE_LAST = 3,
LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
};
enum llama_attention_type {
LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
}; };
enum llama_split_mode { enum llama_split_mode {
@ -208,7 +183,6 @@ extern "C" {
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
}; };
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
typedef struct llama_token_data { typedef struct llama_token_data {
llama_token id; // token id llama_token id; // token id
float logit; // log-odds of the token float logit; // log-odds of the token
@ -216,10 +190,8 @@ extern "C" {
} llama_token_data; } llama_token_data;
typedef struct llama_token_data_array { typedef struct llama_token_data_array {
// TODO: consider SoA
llama_token_data * data; llama_token_data * data;
size_t size; size_t size;
int64_t selected; // this is the index in the data array (i.e. not the token id)
bool sorted; bool sorted;
} llama_token_data_array; } llama_token_data_array;
@ -280,9 +252,9 @@ extern "C" {
enum llama_split_mode split_mode; // how to split the model across multiple GPUs enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// main_gpu interpretation depends on split_mode: // main_gpu interpretation depends on split_mode:
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_MODE_LAYER: ignored // LLAMA_SPLIT_LAYER: ignored
int32_t main_gpu; int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices() // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@ -312,16 +284,16 @@ extern "C" {
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggerganov/llama.cpp/pull/7544 // https://github.com/ggerganov/llama.cpp/pull/7544
struct llama_context_params { struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
int32_t n_threads; // number of threads to use for generation uint32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing uint32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
// ref: https://github.com/ggerganov/llama.cpp/pull/2054 // ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model float rope_freq_base; // RoPE base frequency, 0 = from model
@ -339,13 +311,11 @@ extern "C" {
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL] enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL] enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value. // Keep the booleans together to avoid misalignment during copy-by-value.
// TODO: move at the end of the struct
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embeddings; // if true, extract embeddings (together with logits) bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL] bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
// Abort callback // Abort callback
// if it returns true, execution of llama_decode() will be aborted // if it returns true, execution of llama_decode() will be aborted
@ -359,7 +329,7 @@ extern "C" {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency() int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type enum ggml_type token_embedding_type; // itoken embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@ -369,14 +339,56 @@ extern "C" {
void * kv_overrides; // pointer to vector containing overrides void * kv_overrides; // pointer to vector containing overrides
} llama_model_quantize_params; } llama_model_quantize_params;
typedef struct llama_logit_bias { // grammar types
llama_token token; struct llama_grammar;
float bias;
} llama_logit_bias;
typedef struct llama_sampler_chain_params { // grammar element type
bool no_perf; // whether to measure performance timings enum llama_gretype {
} llama_sampler_chain_params; // end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
// any character (.)
LLAMA_GRETYPE_CHAR_ANY = 7,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
// performance timing information
struct llama_timings {
double t_start_ms;
double t_end_ms;
double t_load_ms;
double t_sample_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_sample;
int32_t n_p_eval;
int32_t n_eval;
};
// used in chat template // used in chat template
typedef struct llama_chat_message { typedef struct llama_chat_message {
@ -384,14 +396,9 @@ extern "C" {
const char * content; const char * content;
} llama_chat_message; } llama_chat_message;
// lora adapter
struct llama_lora_adapter;
// Helpers for getting default parameters // Helpers for getting default parameters
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
LLAMA_API struct llama_model_params llama_model_default_params(void); LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void); LLAMA_API struct llama_context_params llama_context_default_params(void);
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
// Initialize the llama + ggml backend // Initialize the llama + ggml backend
@ -402,13 +409,6 @@ extern "C" {
//optional: //optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa); LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Optional: an auto threadpool gets created in ggml if not passed explicitly
LLAMA_API void llama_attach_threadpool(
struct llama_context * ctx,
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
// Call once at the end of the program - currently only used for MPI // Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void); LLAMA_API void llama_backend_free(void);
@ -418,7 +418,6 @@ extern "C" {
LLAMA_API void llama_free_model(struct llama_model * model); LLAMA_API void llama_free_model(struct llama_model * model);
// TODO: rename to llama_init_from_model
LLAMA_API struct llama_context * llama_new_context_with_model( LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model, struct llama_model * model,
struct llama_context_params params); struct llama_context_params params);
@ -434,22 +433,22 @@ extern "C" {
LLAMA_API bool llama_supports_mlock (void); LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void); LLAMA_API bool llama_supports_gpu_offload(void);
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model); LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_n_embd (const struct llama_model * model); LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_n_layer (const struct llama_model * model); LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_n_head (const struct llama_model * model);
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
// Get the model's RoPE frequency scaling factor // Get the model's RoPE frequency scaling factor
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ -483,51 +482,24 @@ extern "C" {
// Get a llama model tensor // Get a llama model tensor
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name); LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
// Returns true if the model contains an encoder that requires llama_encode() call
LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
// Returns true if the model contains a decoder that requires llama_decode() call
LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
// For encoder-decoder models, this function returns id of the token that must be provided
// to the decoder to start generating output sequence. For other models, it returns -1.
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
// Returns 0 on success // Returns 0 on success
LLAMA_API uint32_t llama_model_quantize( LLAMA_API uint32_t llama_model_quantize(
const char * fname_inp, const char * fname_inp,
const char * fname_out, const char * fname_out,
const llama_model_quantize_params * params); const llama_model_quantize_params * params);
// Load a LoRA adapter from file // Apply a LoRA adapter to a loaded model
// The loaded adapter will be associated to the given model, and will be free when the model is deleted // path_base_model is the path to a higher quality model to use as a base for
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init( // the layers modified by the adapter. Can be NULL to use the current loaded model.
struct llama_model * model, // The model needs to be reloaded before applying a new adapter, otherwise the adapter
const char * path_lora); // will be applied on top of the previous one
// Returns 0 on success
// Add a loaded LoRA adapter to given context LLAMA_API int32_t llama_model_apply_lora_from_file(
// This will not modify model's weight const struct llama_model * model,
LLAMA_API int32_t llama_lora_adapter_set( const char * path_lora,
struct llama_context * ctx, float scale,
struct llama_lora_adapter * adapter, const char * path_base_model,
float scale); int32_t n_threads);
// Remove a specific LoRA adapter from given context
// Return -1 if the adapter is not present in the context
LLAMA_API int32_t llama_lora_adapter_remove(
struct llama_context * ctx,
struct llama_lora_adapter * adapter);
// Remove all LoRA adapters from given context
LLAMA_API void llama_lora_adapter_clear(
struct llama_context * ctx);
// Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
// Apply a loaded control vector to a llama_context, or if data is NULL, clear // Apply a loaded control vector to a llama_context, or if data is NULL, clear
// the currently loaded vector. // the currently loaded vector.
@ -677,11 +649,10 @@ extern "C" {
// State / sessions // State / sessions
// //
// Returns the *actual* size in bytes of the state // Returns the maximum size in bytes of the state (rng, logits, embedding
// (logits, embedding and kv_cache) // and kv_cache) - will often be smaller after compacting tokens
// Only use when saving the state, not when restoring it, otherwise the size may be too small. LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
"use llama_state_get_size instead"); "use llama_state_get_size instead");
// Copies the state to the specified destination address. // Copies the state to the specified destination address.
@ -689,8 +660,7 @@ extern "C" {
// Returns the number of bytes copied // Returns the number of bytes copied
LLAMA_API size_t llama_state_get_data( LLAMA_API size_t llama_state_get_data(
struct llama_context * ctx, struct llama_context * ctx,
uint8_t * dst, uint8_t * dst);
size_t size);
LLAMA_API DEPRECATED(size_t llama_copy_state_data( LLAMA_API DEPRECATED(size_t llama_copy_state_data(
struct llama_context * ctx, struct llama_context * ctx,
uint8_t * dst), uint8_t * dst),
@ -700,8 +670,7 @@ extern "C" {
// Returns the number of bytes read // Returns the number of bytes read
LLAMA_API size_t llama_state_set_data( LLAMA_API size_t llama_state_set_data(
struct llama_context * ctx, struct llama_context * ctx,
const uint8_t * src, const uint8_t * src);
size_t size);
LLAMA_API DEPRECATED(size_t llama_set_state_data( LLAMA_API DEPRECATED(size_t llama_set_state_data(
struct llama_context * ctx, struct llama_context * ctx,
const uint8_t * src), const uint8_t * src),
@ -743,7 +712,6 @@ extern "C" {
LLAMA_API size_t llama_state_seq_get_data( LLAMA_API size_t llama_state_seq_get_data(
struct llama_context * ctx, struct llama_context * ctx,
uint8_t * dst, uint8_t * dst,
size_t size,
llama_seq_id seq_id); llama_seq_id seq_id);
// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence // Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
@ -753,7 +721,6 @@ extern "C" {
LLAMA_API size_t llama_state_seq_set_data( LLAMA_API size_t llama_state_seq_set_data(
struct llama_context * ctx, struct llama_context * ctx,
const uint8_t * src, const uint8_t * src,
size_t size,
llama_seq_id dest_seq_id); llama_seq_id dest_seq_id);
LLAMA_API size_t llama_state_seq_save_file( LLAMA_API size_t llama_state_seq_save_file(
@ -800,14 +767,6 @@ extern "C" {
// Frees a batch of tokens allocated with llama_batch_init() // Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch); LLAMA_API void llama_batch_free(struct llama_batch batch);
// Processes a batch of tokens with the ecoder part of the encoder-decoder model.
// Stores the encoder output internally for later use by the decoder cross-attention layers.
// 0 - success
// < 0 - error
LLAMA_API int32_t llama_encode(
struct llama_context * ctx,
struct llama_batch batch);
// Positive return values does not mean a fatal error, but rather a warning. // Positive return values does not mean a fatal error, but rather a warning.
// 0 - success // 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context) // 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
@ -819,13 +778,13 @@ extern "C" {
// Set the number of threads used for decoding // Set the number of threads used for decoding
// n_threads is the number of threads used for generation (single token) // n_threads is the number of threads used for generation (single token)
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch); LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
// Get the number of threads used for generation of a single token. // Get the number of threads used for generation of a single token.
LLAMA_API int32_t llama_n_threads(struct llama_context * ctx); LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
// Get the number of threads used for prompt and batch processing (multiple token). // Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx); LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not // Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not // If true, embeddings will be returned but logits will not
@ -873,8 +832,7 @@ extern "C" {
// Get the embeddings for a sequence id // Get the embeddings for a sequence id
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence // shape: [n_embd] (1-dimensional)
// otherwise: float[n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
// //
@ -899,10 +857,12 @@ extern "C" {
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
LLAMA_API bool llama_add_bos_token(const struct llama_model * model); // Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API bool llama_add_eos_token(const struct llama_model * model); LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
// Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
// Codellama infill tokens // Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@ -913,14 +873,11 @@ extern "C" {
// //
// Tokenization // Tokenization
// //
// The API is thread-safe.
//
/// @details Convert the provided text into tokens. /// @details Convert the provided text into tokens.
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens. /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
/// @return Returns the number of tokens on success, no more than n_tokens_max /// @return Returns the number of tokens on success, no more than n_tokens_max
/// @return Returns a negative number on failure - the number of tokens that would have been returned /// @return Returns a negative number on failure - the number of tokens that would have been returned
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
/// as plaintext. Does not insert a leading space. /// as plaintext. Does not insert a leading space.
LLAMA_API int32_t llama_tokenize( LLAMA_API int32_t llama_tokenize(
@ -935,35 +892,15 @@ extern "C" {
// Token Id -> Piece. // Token Id -> Piece.
// Uses the vocabulary in the provided context. // Uses the vocabulary in the provided context.
// Does not write null terminator to the buffer. // Does not write null terminator to the buffer.
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix') // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
// @param special If true, special tokens are rendered in the output. // @param special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_token_to_piece( LLAMA_API int32_t llama_token_to_piece(
const struct llama_model * model, const struct llama_model * model,
llama_token token, llama_token token,
char * buf, char * buf,
int32_t length, int32_t length,
int32_t lstrip,
bool special); bool special);
/// @details Convert the provided tokens into text (inverse of llama_tokenize()).
/// @param text The char pointer must be large enough to hold the resulting text.
/// @return Returns the number of chars/bytes on success, no more than text_len_max.
/// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
/// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
/// @param unparse_special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_detokenize(
const struct llama_model * model,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);
//
// Chat templates
//
/// Apply chat template. Inspired by hf apply_chat_template() on python. /// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model" /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
/// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@ -984,114 +921,104 @@ extern "C" {
int32_t length); int32_t length);
// //
// Sampling API // Grammar
//
// Sample usage:
//
// // prepare the sampling chain at the start
// auto sparams = llama_sampler_chain_default_params();
//
// llama_sampler * smpl = llama_sampler_chain_init(sparams);
//
// llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
// llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
// llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8));
//
// // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat"
// // this sampler will be responsible to select the actual token
// llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));
//
// ...
//
// // decoding loop:
// while (...) {
// ...
//
// llama_decode(ctx, batch);
//
// // sample from the logits of the last token in the batch
// const llama_token id = llama_sampler_sample(smpl, ctx, -1);
//
// // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
// llama_sampler_accept(smpl, id);
// ...
// }
//
// llama_sampler_free(smpl);
//
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
// TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
// //
typedef void * llama_sampler_context_t; LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
// user code can implement the interface below in order to create custom llama_sampler LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
struct llama_sampler_i {
const char * (*name) (const struct llama_sampler * smpl); // can be NULL
void (*accept)( struct llama_sampler * smpl, llama_token token); // can be NULL
void (*apply) ( struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
void (*reset) ( struct llama_sampler * smpl); // can be NULL
struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
// TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
//void (*apply_ggml) (struct llama_sampler * smpl, ...);
};
struct llama_sampler { //
struct llama_sampler_i * iface; // Sampling functions
llama_sampler_context_t ctx; //
};
// mirror of llama_sampler_i: // Sets the current rng seed.
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl);
LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
// important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
LLAMA_API void llama_sampler_free ( struct llama_sampler * smpl);
// llama_sampler_chain /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
// a type of llama_sampler that can chain multiple samplers one after another /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_repetition_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present);
LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params); /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param logits Logits extracted from the original generation context.
// important: takes ownership of the sampler object and will free it when llama_sampler_free is called /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl); /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); LLAMA_API void llama_sample_apply_guidance(
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); struct llama_context * ctx,
float * logits,
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed float * logits_guidance,
LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i); float scale);
// available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. LLAMA_API void llama_sample_softmax(
LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void); struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); LLAMA_API void llama_sample_top_k(
struct llama_context * ctx,
llama_token_data_array * candidates,
int32_t k,
size_t min_keep);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep); LLAMA_API void llama_sample_top_p(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep); LLAMA_API void llama_sample_min_p(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep); LLAMA_API void llama_sample_tail_free(
struct llama_context * ctx,
llama_token_data_array * candidates,
float z,
size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep); LLAMA_API void llama_sample_typical(
LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); LLAMA_API void llama_sample_entropy(
struct llama_context * ctx,
llama_token_data_array * candidates_p,
float min_temp,
float max_temp,
float exponent_val);
LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,
float temp);
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@ -1099,62 +1026,42 @@ extern "C" {
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API struct llama_sampler * llama_sampler_init_mirostat( LLAMA_API llama_token llama_sample_token_mirostat(
int32_t n_vocab, struct llama_context * ctx,
uint32_t seed, llama_token_data_array * candidates,
float tau, float tau,
float eta, float eta,
int32_t m); int32_t m,
float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2( LLAMA_API llama_token llama_sample_token_mirostat_v2(
uint32_t seed, struct llama_context * ctx,
llama_token_data_array * candidates,
float tau, float tau,
float eta); float eta,
float * mu);
LLAMA_API struct llama_sampler * llama_sampler_init_grammar( /// @details Selects the token with the highest probability.
const struct llama_model * model, /// Does not compute the token probabilities. Use llama_sample_softmax() instead.
const char * grammar_str, LLAMA_API llama_token llama_sample_token_greedy(
const char * grammar_root); struct llama_context * ctx,
llama_token_data_array * candidates);
LLAMA_API struct llama_sampler * llama_sampler_init_penalties( /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
int32_t n_vocab, // llama_n_vocab() LLAMA_API llama_token llama_sample_token(
llama_token special_eos_id, // llama_token_eos() struct llama_context * ctx,
llama_token linefeed_id, // llama_token_nl() llama_token_data_array * candidates);
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat, // 1.0 = disabled
float penalty_freq, // 0.0 = disabled
float penalty_present, // 0.0 = disabled
bool penalize_nl, // consider newlines as a repeatable token
bool ignore_eos); // ignore the end-of-sequence token
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( /// @details Accepts the sampled token into the grammar
int32_t n_vocab, LLAMA_API void llama_grammar_accept_token(
int32_t n_logit_bias, struct llama_context * ctx,
const llama_logit_bias * logit_bias); struct llama_grammar * grammar,
llama_token token);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
/// @details Sample and accept a token from the idx-th output of the last evaluation
//
// Shorthand for:
// const auto * logits = llama_get_logits_ith(ctx, idx);
// llama_token_data_array cur_p = { ... init from logits ... };
// llama_sampler_apply(smpl, &cur_p);
// auto token = cur_p.data[cur_p.selected].id;
// llama_sampler_accept(smpl, token);
// return token;
// Returns the sampled token
LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
// TODO: extend in the future
//LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
// //
// Model split // Model split
@ -1170,6 +1077,12 @@ extern "C" {
// Returns the split_prefix length. // Returns the split_prefix length.
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count); LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information // Print system information
LLAMA_API const char * llama_print_system_info(void); LLAMA_API const char * llama_print_system_info(void);
@ -1177,41 +1090,58 @@ extern "C" {
// If this is not called, or NULL is supplied, everything is output on stderr. // If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
// LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
// Performance utils
//
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
//
struct llama_perf_context_data {
double t_start_ms;
double t_load_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_p_eval;
int32_t n_eval;
};
struct llama_perf_sampler_data {
double t_sample_ms;
int32_t n_sample;
};
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
// NOTE: the following work only with samplers constructed via llama_sampler_chain_init
LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include <random>
#include <string>
#include <vector>
struct ggml_tensor;
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
void llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start);
// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_H #endif // LLAMA_H

View File

@ -35,10 +35,10 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0); std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
if (n_tokens < 0) { if (n_tokens < 0) {
result.resize(-n_tokens); result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false); int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens); GGML_ASSERT(check == -n_tokens);
} else { } else {
result.resize(n_tokens); result.resize(n_tokens);
@ -314,6 +314,7 @@ int main(int argc, char ** argv) {
// tune these to your liking // tune these to your liking
lcparams.n_ctx = 2048; lcparams.n_ctx = 2048;
lcparams.seed = 1;
lcparams.n_threads = params.n_threads; lcparams.n_threads = params.n_threads;
lcparams.flash_attn = params.flash_attn; lcparams.flash_attn = params.flash_attn;
@ -401,26 +402,6 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1); llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);
// init sampler
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const int seed = 0;
auto sparams = llama_sampler_chain_default_params();
llama_sampler * smpl = llama_sampler_chain_init(sparams);
if (temp > 0.0f) {
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));
} else {
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
}
// init session // init session
std::string path_session = params.path_session; std::string path_session = params.path_session;
std::vector<llama_token> session_tokens; std::vector<llama_token> session_tokens;
@ -436,7 +417,7 @@ int main(int argc, char ** argv) {
session_tokens.resize(llama_n_ctx(ctx_llama)); session_tokens.resize(llama_n_ctx(ctx_llama));
size_t n_token_count_out = 0; size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1; return 1;
} }
@ -719,13 +700,54 @@ int main(int argc, char ** argv) {
{ {
// out of user input, sample next token // out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const float repeat_penalty = 1.1764f;
const int repeat_last_n = 256;
if (!path_session.empty() && need_to_save_session) { if (!path_session.empty() && need_to_save_session) {
need_to_save_session = false; need_to_save_session = false;
llama_state_save_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size()); llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
} }
const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1); llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(model_llama);
logits[llama_token_eos(model_llama)] = 0;
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// apply repeat penalty
const float nl_logit = logits[llama_token_nl(model_llama)];
llama_sample_repetition_penalties(ctx_llama, &candidates_p,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, repeat_penalty, 0.0, 0.0f);
logits[llama_token_nl(model_llama)] = nl_logit;
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_temp (ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
}
if (id != llama_token_eos(model_llama)) { if (id != llama_token_eos(model_llama)) {
// add it to the context // add it to the context
@ -775,14 +797,8 @@ int main(int argc, char ** argv) {
whisper_print_timings(ctx_wsp); whisper_print_timings(ctx_wsp);
whisper_free(ctx_wsp); whisper_free(ctx_wsp);
llama_perf_sampler_print(smpl); llama_print_timings(ctx_llama);
llama_perf_context_print(ctx_llama);
llama_sampler_free(smpl);
llama_batch_free(batch);
llama_free(ctx_llama); llama_free(ctx_llama);
llama_backend_free();
return 0; return 0;
} }

View File

@ -7,7 +7,7 @@
#include <unordered_map> #include <unordered_map>
#include <unordered_set> #include <unordered_set>
const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1 const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
{0x000000, 0x0080}, {0x000000, 0x0080},
{0x000020, 0x0008}, {0x000020, 0x0008},
{0x000021, 0x0020}, {0x000021, 0x0020},
@ -2311,8 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
0x003000, 0x003000,
}; };
// list is always in ascending order, to enable binary searh const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
{0x000041, 0x000061}, {0x000041, 0x000061},
{0x000042, 0x000062}, {0x000042, 0x000062},
{0x000043, 0x000063}, {0x000043, 0x000063},
@ -3748,8 +3747,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
{0x01E921, 0x01E943}, {0x01E921, 0x01E943},
}; };
// list is always in ascending order, to enable binary searh const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
{0x000061, 0x000041}, {0x000061, 0x000041},
{0x000062, 0x000042}, {0x000062, 0x000042},
{0x000063, 0x000043}, {0x000063, 0x000043},
@ -5202,7 +5200,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase
{0x01E943, 0x01E921}, {0x01E943, 0x01E921},
}; };
const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, nfd const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
{0x000000, 0x000000, 0x000000}, {0x000000, 0x000000, 0x000000},
{0x0000C0, 0x0000C5, 0x000041}, {0x0000C0, 0x0000C5, 0x000041},
{0x0000C7, 0x0000C7, 0x000043}, {0x0000C7, 0x0000C7, 0x000043},
@ -7032,3 +7030,4 @@ const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, n
{0x02FA1C, 0x02FA1C, 0x009F3B}, {0x02FA1C, 0x02FA1C, 0x009F3B},
{0x02FA1D, 0x02FA1D, 0x02A600}, {0x02FA1D, 0x02FA1D, 0x02A600},
}; };

View File

@ -13,8 +13,8 @@ struct range_nfd {
static const uint32_t MAX_CODEPOINTS = 0x110000; static const uint32_t MAX_CODEPOINTS = 0x110000;
extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags; extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
extern const std::unordered_set<uint32_t> unicode_set_whitespace; extern const std::unordered_set<uint32_t> unicode_set_whitespace;
extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase; extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase; extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
extern const std::initializer_list<range_nfd> unicode_ranges_nfd; extern const std::vector<range_nfd> unicode_ranges_nfd;

View File

@ -1,11 +1,6 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "unicode.h" #include "unicode.h"
#include "unicode-data.h" #include "unicode-data.h"
#include <algorithm>
#include <cassert> #include <cassert>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
@ -20,12 +15,6 @@
#include <locale> #include <locale>
#include <codecvt> #include <codecvt>
size_t unicode_len_utf8(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
return lookup[highbits];
}
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) { static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result; std::string result;
for (size_t i = 0; i < cps.size(); ++i) { for (size_t i = 0; i < cps.size(); ++i) {
@ -34,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
return result; return result;
} }
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) { static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
assert(offset < utf8.size()); assert(offset < utf8.size());
if (!(utf8[offset + 0] & 0x80)) { if (!(utf8[offset + 0] & 0x80)) {
auto result = utf8[offset + 0]; auto result = utf8[offset + 0];
@ -123,11 +112,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
static std::vector<codepoint_flags> unicode_cpt_flags_array() { static std::vector<codepoint_flags> unicode_cpt_flags_array() {
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED); std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
assert (unicode_ranges_flags.begin()[0].first == 0); assert (unicode_ranges_flags.front().first == 0);
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS); assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) { for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
const auto range_end = unicode_ranges_flags.begin()[i]; // codepoint_end, flags const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) { for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
cpt_flags[cpt] = range_ini.second; cpt_flags[cpt] = range_ini.second;
} }
@ -243,7 +232,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
}; };
auto _get_flags = [&] (const size_t pos) -> codepoint_flags { auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; static const codepoint_flags undef(codepoint_flags::UNDEFINED);
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
}; };
size_t _prev_end = offset_ini; size_t _prev_end = offset_ini;
@ -305,9 +295,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
continue; continue;
} }
// regex: <space>?[^\s\p{L}\p{N}]+ // regex: <space>?[^\s\p{L}\p{N}]+
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
pos += (cpt == ' '); pos += (cpt == ' ');
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
flags2 = _get_flags(++pos); flags2 = _get_flags(++pos);
} }
_add_token(pos); _add_token(pos);
@ -361,7 +351,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
}; };
auto _get_flags = [&] (const size_t pos) -> codepoint_flags { auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{}; static const codepoint_flags undef(codepoint_flags::UNDEFINED);
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
}; };
size_t _prev_end = offset_ini; size_t _prev_end = offset_ini;
@ -403,8 +394,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
} }
} }
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) { if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
pos++; pos++;
while (_get_flags(pos).is_letter) { while (_get_flags(pos).is_letter) {
@ -430,9 +421,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]* // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags); auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) { if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
pos += (cpt == ' '); pos += (cpt == ' ');
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) { while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
flags2 = _get_flags(++pos); flags2 = _get_flags(++pos);
} }
uint32_t cpt2 = _get_cpt(pos); uint32_t cpt2 = _get_cpt(pos);
@ -597,7 +588,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
std::vector<uint32_t> result(cpts.size()); std::vector<uint32_t> result(cpts.size());
for (size_t i = 0; i < cpts.size(); ++i) { for (size_t i = 0; i < cpts.size(); ++i) {
const uint32_t cpt = cpts[i]; const uint32_t cpt = cpts[i];
auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1; auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt; result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
} }
return result; return result;
@ -639,15 +630,8 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
} }
uint32_t unicode_tolower(uint32_t cp) { uint32_t unicode_tolower(uint32_t cp) {
// binary search auto it = unicode_map_lowercase.find(cp);
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp, return it == unicode_map_lowercase.end() ? cp : it->second;
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
return pair.first < value;
});
if (it != unicode_map_lowercase.end() && it->first == cp) {
return it->second;
}
return cp; // Return the original code point if no lowercase mapping is found
} }
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) { std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {

View File

@ -4,8 +4,6 @@
#include <string> #include <string>
#include <vector> #include <vector>
// TODO: prefix all symbols with "llama_"
struct codepoint_flags { struct codepoint_flags {
enum { enum {
UNDEFINED = 0x0001, UNDEFINED = 0x0001,
@ -48,10 +46,8 @@ struct codepoint_flags {
} }
}; };
size_t unicode_len_utf8(char src);
std::string unicode_cpt_to_utf8(uint32_t cp); std::string unicode_cpt_to_utf8(uint32_t cp);
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8); std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts); std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);

View File

@ -21,7 +21,7 @@ help()
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]" echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
echo "options:" echo "options:"
echo "-s Step in seconds (default is $step)." echo "-s Step in seconds (default is $step)."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' 'large-v3-turbo' (default is '$model')." echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
echo "-t Number of threads to use." echo "-t Number of threads to use."
echo "-h Print this help page." echo "-h Print this help page."
echo echo

View File

@ -7,9 +7,8 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(SOURCE_FILES set(SOURCE_FILES
${WHISPER_LIB_DIR}/ggml/src/ggml.c ${WHISPER_LIB_DIR}/ggml/src/ggml.c
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
${WHISPER_LIB_DIR}/src/whisper.cpp ${WHISPER_LIB_DIR}/src/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c ${CMAKE_SOURCE_DIR}/jni.c

View File

@ -19,9 +19,8 @@ if (NOT GGML_HOME)
SOURCE_FILES SOURCE_FILES
${SOURCE_FILES} ${SOURCE_FILES}
${WHISPER_LIB_DIR}/ggml/src/ggml.c ${WHISPER_LIB_DIR}/ggml/src/ggml.c
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c ${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp ${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c ${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
) )
endif() endif()

View File

@ -7,7 +7,6 @@
objects = { objects = {
/* Begin PBXBuildFile section */ /* Begin PBXBuildFile section */
18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18133C7F2C64E342005CEAAC /* ggml-aarch64.c */; };
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; }; 1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; }; 1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; }; 18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
@ -22,7 +21,7 @@
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; }; 18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; }; 18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; }; 18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; }; 18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; }; 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; }; 7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; }; 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
@ -45,8 +44,6 @@
/* End PBXCopyFilesBuildPhase section */ /* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */ /* Begin PBXFileReference section */
18133C7E2C64E342005CEAAC /* ggml-aarch64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-aarch64.h"; path = "../../../ggml/src/ggml-aarch64.h"; sourceTree = "<group>"; };
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; }; 184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; }; 184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; }; 1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
@ -73,7 +70,7 @@
18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; }; 18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; }; 18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; }; 18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; }; 18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; }; 18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; }; 18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; }; 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
@ -115,12 +112,10 @@
18627C7829052BDF00BD2A04 /* whisper.objc */ = { 18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup; isa = PBXGroup;
children = ( children = (
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
18133C7E2C64E342005CEAAC /* ggml-aarch64.h */,
18A275FF2C2A9563001C8D37 /* ggml-common.h */, 18A275FF2C2A9563001C8D37 /* ggml-common.h */,
18A275FE2C2A94DE001C8D37 /* ggml-metal.h */, 18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
18ABE1562AF556340044A204 /* ggml-backend-impl.h */, 18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
18ABE1572AF556340044A204 /* ggml-backend.cpp */, 18ABE1572AF556340044A204 /* ggml-backend.c */,
18ABE1552AF556340044A204 /* ggml-backend.h */, 18ABE1552AF556340044A204 /* ggml-backend.h */,
18ABE1582AF556340044A204 /* ggml-impl.h */, 18ABE1582AF556340044A204 /* ggml-impl.h */,
18ABE1592AF556340044A204 /* ggml-quants.c */, 18ABE1592AF556340044A204 /* ggml-quants.c */,
@ -241,14 +236,13 @@
files = ( files = (
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */, 18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */, 18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */,
18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */,
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */, 7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */, 18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
18627C9629052C5800BD2A04 /* ggml.c in Sources */, 18627C9629052C5800BD2A04 /* ggml.c in Sources */,
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */, 18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */, 7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */, 1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */, 18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
18627C8C29052BE000BD2A04 /* main.m in Sources */, 18627C8C29052BE000BD2A04 /* main.m in Sources */,
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */, 18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */, 1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,

View File

@ -50,24 +50,9 @@ else()
set(GGML_BLAS_VENDOR_DEFAULT "Generic") set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif() endif()
if (CMAKE_CROSSCOMPILING)
set(GGML_NATIVE_DEFAULT OFF)
else()
set(GGML_NATIVE_DEFAULT ON)
endif()
# defaults
if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
endif()
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()
# general # general
option(GGML_STATIC "ggml: static link libraries" OFF) option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT}) option(GGML_NATIVE "ggml: enable -march=native flag" ON)
option(GGML_LTO "ggml: enable link time optimization" OFF) option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON) option(GGML_CCACHE "ggml: use ccache if available" ON)
@ -85,7 +70,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF) option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
# instruction set specific # instruction set specific
if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT) if (GGML_NATIVE)
set(INS_ENB OFF) set(INS_ENB OFF)
else() else()
set(INS_ENB ON) set(INS_ENB ON)
@ -119,13 +104,11 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT}) option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor") "ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT}) option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF) option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF) option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF) option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels") set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels") set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF) option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
@ -136,16 +119,14 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF) option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF) option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF) option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF) option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF) option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF) option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF) option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF) option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF) option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_KOMPUTE "ggml: use Kompute" OFF)
@ -211,20 +192,13 @@ endif ()
include(GNUInstallDirs) include(GNUInstallDirs)
include(CMakePackageConfigHelpers) include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS set(GGML_PUBLIC_HEADERS
include/ggml.h include/ggml.h
include/ggml-alloc.h include/ggml-alloc.h
include/ggml-backend.h include/ggml-backend.h
include/ggml-blas.h "${GGML_HEADERS_CUDA}"
include/ggml-cann.h "${GGML_HEADERS_METAL}"
include/ggml-cuda.h "${GGML_HEADERS_EXTRA}")
include/ggml.h
include/ggml-kompute.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-vulkan.h)
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}") set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL) #if (GGML_METAL)

View File

@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator // Graph allocator
/* /*
Example usage: Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
// optional: create a worst-case graph and reserve the buffers to avoid reallocations // optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch)); ggml_gallocr_reserve(galloc, build_graph(max_batch));

View File

@ -12,52 +12,41 @@ extern "C" {
typedef struct ggml_backend_event * ggml_backend_event_t; typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t; typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t; typedef void * ggml_backend_graph_plan_t;
typedef struct ggml_backend_reg * ggml_backend_reg_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;
//
// Backend buffer type
//
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
// //
// Backend buffer // Backend buffer
// //
// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage { enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0, GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
}; };
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer); GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// //
// Backend (stream) // Backend
// //
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend); GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
@ -72,10 +61,8 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset of the tensor data for setting/getting data GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend); GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@ -85,118 +72,64 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft); GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy // asynchronous copy
// the copy is performed after all the currently queued operations in backend_src // the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations // backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported // automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst); GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend); // events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
// GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
// Events GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
//
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event); GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event); GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
// //
// Backend device // CPU backend
// //
enum ggml_backend_dev_type { GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_BACKEND_DEVICE_TYPE_CPU,
GGML_BACKEND_DEVICE_TYPE_GPU,
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
};
// functionality supported by the device GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
struct ggml_backend_dev_caps { GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
// asynchronous operations GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
bool async;
// pinned host buffer
bool host_buffer;
// event synchronization
bool events;
};
// all the device properties // Create a backend buffer from an existing pointer
struct ggml_backend_dev_props { GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
const char * name;
const char * description;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op); #ifdef GGML_USE_CPU_HBM
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft); GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op); #endif
//
// Backend (reg)
//
GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
// Functions that may be obtained using ggml_backend_reg_get_proc_address
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
// //
// Backend registry // Backend registry
// //
// Backend (reg) enumeration // The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
// Device enumeration GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_dev_count(void); GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index); GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name); GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type); GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
// Direct backend (stream) initialization GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
// //
// Backend scheduler // Backend scheduler
// //
// The backend scheduler allows for multiple backend devices to be used together // The backend scheduler allows for multiple backends to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends // Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on: // The backends are selected based on:
// - the backend that supports the operation // - the backend that supports the operation
@ -231,9 +164,9 @@ extern "C" {
} }
*/ */
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t; typedef struct ggml_backend_sched * ggml_backend_sched_t;
// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
// when ask == true, the scheduler wants to know if the user wants to observe this node // when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call // this allows the scheduler to batch nodes together in order to evaluate them in a single call
// //
@ -247,7 +180,7 @@ extern "C" {
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph // Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched); GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i); GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
@ -262,7 +195,7 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler // Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph); GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched); GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
@ -288,7 +221,7 @@ extern "C" {
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph); GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy); GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends // Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
@ -297,26 +230,6 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor); GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -9,13 +9,13 @@ extern "C" {
#endif #endif
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_blas_init(void); GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend); GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float // number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations // for openblas and blis, this will also set the number of threads used for blas operations
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads); GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
#ifdef __cplusplus #ifdef __cplusplus

View File

@ -1,121 +0,0 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#pragma once
#include "ggml-backend.h"
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Maximum number of CANN devices supported.
*/
#define GGML_CANN_MAX_DEVICES 16
/**
* @brief Initializes the CANN backend for a specified device.
*
* This function initializes the CANN backend for the given device.
* It verifies the device index, allocates a context, and creates a backend
* instance.
*
* @param device The index of the device to initialize.
* @return A pointer to the initialized backend instance, or nullptr on failure.
*/
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
/**
* @brief Checks if a given backend is a CANN backend.
*
* This function verifies if the provided backend is a CANN backend by comparing
* its GUID with the CANN backend's GUID.
*
* @param backend The backend instance to check.
* @return True if the backend is a CANN backend, false otherwise.
*/
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
/**
* @brief Retrieves the CANN buffer type for a specified device.
*
* This function initializes and returns the buffer type interface associated
* with the given device. It ensures thread-safe access using a mutex.
*
* @param device The device index for which to retrieve the buffer type.
* @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range.
*/
GGML_API ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device);
/**
* @brief Retrieves the number of CANN devices available.
*
* This function returns the number of CANN devices available based on
* information obtained from `ggml_cann_info()`.
*
* @return The number of CANN devices available.
*/
GGML_API int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
*
* @return A pointer to the host buffer type interface.
*/
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
/**
* @brief Retrieves the description of a specific CANN device.
*
* This function sets the specified device, retrieves the SoC name,
* and writes it into the provided description buffer.
*
* @param device The device index to retrieve the description for.
* @param description Pointer to a buffer where the description will be written.
* @param description_size Size of the description buffer.
*/
GGML_API void ggml_backend_cann_get_device_description(
int32_t device, char* description, size_t description_size);
/**
* @brief Retrieves the memory information of a specific CANN device.
*
* This function sets the specified device, retrieves the free and total
* memory information of the specified type (ACL_HBM_MEM), and stores them
* in the provided pointers.
*
* @param device The device index to retrieve memory information for.
* @param free Pointer to a variable where the free memory size will be stored.
* @param total Pointer to a variable where the total memory size will be
* stored.
*/
GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
size_t* free,
size_t* total);
#ifdef __cplusplus
}
#endif

View File

@ -3,45 +3,42 @@
#include "ggml.h" #include "ggml.h"
#include "ggml-backend.h" #include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef GGML_USE_HIPBLAS #ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm" #define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS" #define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
#define GGML_CUDA_NAME "MUSA"
#define GGML_CUBLAS_NAME "muBLAS"
#else #else
#define GGML_CUDA_NAME "CUDA" #define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS" #define GGML_CUBLAS_NAME "cuBLAS"
#endif #endif
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16 #define GGML_CUDA_MAX_DEVICES 16
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_cuda_init(int device); GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer // device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices // split tensor buffer that splits matrices by rows across multiple devices
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API int ggml_backend_cuda_get_device_count(void); GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer); GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -1,5 +1,3 @@
// Note: this description is outdated
//
// An interface allowing to compute ggml_cgraph with Metal // An interface allowing to compute ggml_cgraph with Metal
// //
// This is a fully functional interface that extends ggml with GPU support for Apple devices. // This is a fully functional interface that extends ggml with GPU support for Apple devices.
@ -27,6 +25,9 @@
#include <stddef.h> #include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 64
struct ggml_tensor; struct ggml_tensor;
struct ggml_cgraph; struct ggml_cgraph;
@ -39,15 +40,17 @@ extern "C" {
// user-code should use only these functions // user-code should use only these functions
// //
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API ggml_backend_t ggml_backend_metal_init(void); GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend); GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size); GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data); GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family // helper to check if the device supports a specific family
// ideally, the user code should be doing these checks // ideally, the user code should be doing these checks
@ -60,3 +63,4 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -10,14 +10,14 @@ extern "C" {
#define GGML_RPC_MAX_SERVERS 16 #define GGML_RPC_MAX_SERVERS 16
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint); GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend); GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total); GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem); GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device); GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices // split tensor buffer that splits matrices by rows across multiple devices
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void); GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len); GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size); GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API int ggml_backend_sycl_get_device_count(); GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
// SYCL doesn't support registering host memory, keep here for reference // SYCL doesn't support registering host memory, keep here for reference
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size); // GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer); // GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -13,16 +13,16 @@ extern "C" {
GGML_API void ggml_vk_instance_init(void); GGML_API void ggml_vk_instance_init(void);
// backend API // backend API
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num); GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend); GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API int ggml_backend_vk_get_device_count(void); GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size); GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total); GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void); GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -187,6 +187,16 @@
# define GGML_API # define GGML_API
#endif #endif
#ifdef GGML_MULTIPLATFORM
# if defined(_WIN32)
# define GGML_CALL
# else
# define GGML_CALL __attribute__((__ms_abi__))
# endif
#else
# define GGML_CALL
#endif
// TODO: support for clang // TODO: support for clang
#ifdef __GNUC__ #ifdef __GNUC__
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint))) # define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@ -210,24 +220,21 @@
#include <stdio.h> #include <stdio.h>
#define GGML_FILE_MAGIC 0x67676d6c // "ggml" #define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 2 #define GGML_FILE_VERSION 1
#define GGML_QNT_VERSION 2 // bump this on quantization format changes #define GGML_QNT_VERSION 2 // bump this on quantization format changes
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this #define GGML_QNT_VERSION_FACTOR 1000 // do not change this
#define GGML_MAX_DIMS 4 #define GGML_MAX_DIMS 4
#define GGML_MAX_PARAMS 2048 #define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 10 #define GGML_MAX_SRC 10
#define GGML_MAX_N_THREADS 512
#define GGML_MAX_OP_PARAMS 64
#ifndef GGML_MAX_NAME #ifndef GGML_MAX_NAME
# define GGML_MAX_NAME 64 #define GGML_MAX_NAME 64
#endif #endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
#define GGML_DEFAULT_GRAPH_SIZE 2048 #define GGML_DEFAULT_GRAPH_SIZE 2048
#if UINTPTR_MAX == 0xFFFFFFFF #if UINTPTR_MAX == 0xFFFFFFFF
#define GGML_MEM_ALIGN 4 #define GGML_MEM_ALIGN 4
#else #else
@ -237,8 +244,6 @@
#define GGML_EXIT_SUCCESS 0 #define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1 #define GGML_EXIT_ABORTED 1
#define GGML_ROPE_TYPE_NEOX 2
#define GGUF_MAGIC "GGUF" #define GGUF_MAGIC "GGUF"
#define GGUF_VERSION 3 #define GGUF_VERSION 3
@ -249,27 +254,26 @@
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1)) #define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
ggml_print_backtrace(); \
abort(); \
} \
} while (0)
#ifndef NDEBUG #ifndef NDEBUG
# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0) #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
#elif defined(__GNUC__) #elif defined(__GNUC__)
# define GGML_UNREACHABLE() __builtin_unreachable() #define GGML_UNREACHABLE() __builtin_unreachable()
#elif defined(_MSC_VER) #elif defined(_MSC_VER)
# define GGML_UNREACHABLE() __assume(0) #define GGML_UNREACHABLE() __assume(0)
#else #else
# define GGML_UNREACHABLE() ((void) 0) #define GGML_UNREACHABLE() ((void) 0)
#endif #endif
#ifdef __cplusplus
# define GGML_NORETURN [[noreturn]]
#elif defined(_MSC_VER)
# define GGML_NORETURN __declspec(noreturn)
#else
# define GGML_NORETURN _Noreturn
#endif
#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
// used to copy the number of elements and stride in bytes of tensors into local variables. // used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability. // main purpose is to reduce code duplication and improve readability.
// //
@ -318,9 +322,6 @@
extern "C" { extern "C" {
#endif #endif
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
enum ggml_status { enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2, GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1, GGML_STATUS_FAILED = -1,
@ -329,7 +330,7 @@ extern "C" {
}; };
// get ggml_status name string // get ggml_status name string
GGML_API const char * ggml_status_to_string(enum ggml_status status); GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
// ieee 754-2008 half-precision float16 // ieee 754-2008 half-precision float16
// todo: make this not an integral type // todo: make this not an integral type
@ -344,12 +345,10 @@ extern "C" {
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float); GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16 GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t); GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t); GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
struct ggml_object; struct ggml_object;
struct ggml_context; struct ggml_context;
struct ggml_cgraph;
// NOTE: always add types at the end of the enum to keep backward compatibility // NOTE: always add types at the end of the enum to keep backward compatibility
enum ggml_type { enum ggml_type {
@ -384,11 +383,6 @@ extern "C" {
GGML_TYPE_F64 = 28, GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29, GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30, GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
@ -430,9 +424,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
}; };
// available tensor operations: // available tensor operations:
@ -449,13 +440,10 @@ extern "C" {
GGML_OP_SQR, GGML_OP_SQR,
GGML_OP_SQRT, GGML_OP_SQRT,
GGML_OP_LOG, GGML_OP_LOG,
GGML_OP_SIN,
GGML_OP_COS,
GGML_OP_SUM, GGML_OP_SUM,
GGML_OP_SUM_ROWS, GGML_OP_SUM_ROWS,
GGML_OP_MEAN, GGML_OP_MEAN,
GGML_OP_ARGMAX, GGML_OP_ARGMAX,
GGML_OP_COUNT_EQUAL,
GGML_OP_REPEAT, GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK, GGML_OP_REPEAT_BACK,
GGML_OP_CONCAT, GGML_OP_CONCAT,
@ -489,11 +477,9 @@ extern "C" {
GGML_OP_CLAMP, GGML_OP_CLAMP,
GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL, GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D, GGML_OP_POOL_1D,
GGML_OP_POOL_2D, GGML_OP_POOL_2D,
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD, GGML_OP_PAD,
GGML_OP_ARANGE, GGML_OP_ARANGE,
@ -509,7 +495,6 @@ extern "C" {
GGML_OP_WIN_UNPART, GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS, GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS, GGML_OP_ADD_REL_POS,
GGML_OP_RWKV_WKV,
GGML_OP_UNARY, GGML_OP_UNARY,
@ -526,7 +511,6 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS, GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK, GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_OPT_STEP_ADAMW,
GGML_OP_COUNT, GGML_OP_COUNT,
}; };
@ -545,7 +529,6 @@ extern "C" {
GGML_UNARY_OP_SILU, GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH, GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID, GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,
GGML_UNARY_OP_COUNT, GGML_UNARY_OP_COUNT,
}; };
@ -557,22 +540,32 @@ extern "C" {
}; };
enum ggml_log_level { enum ggml_log_level {
GGML_LOG_LEVEL_NONE = 0, GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_INFO = 1, GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_WARN = 2, GGML_LOG_LEVEL_INFO = 4,
GGML_LOG_LEVEL_ERROR = 3, GGML_LOG_LEVEL_DEBUG = 5
GGML_LOG_LEVEL_DEBUG = 4,
GGML_LOG_LEVEL_CONT = 5, // continue previous log
}; };
// this tensor...
enum ggml_tensor_flag { enum ggml_tensor_flag {
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph GGML_TENSOR_FLAG_INPUT = 1,
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph GGML_TENSOR_FLAG_OUTPUT = 2,
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters GGML_TENSOR_FLAG_PARAM = 4,
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
}; };
// ggml object
struct ggml_object {
size_t offs;
size_t size;
struct ggml_object * next;
enum ggml_object_type type;
char padding[4];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// n-dimensional tensor // n-dimensional tensor
struct ggml_tensor { struct ggml_tensor {
enum ggml_type type; enum ggml_type type;
@ -618,29 +611,6 @@ extern "C" {
// If it returns true, the computation is aborted // If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data); typedef bool (*ggml_abort_callback)(void * data);
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute() // the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287 // since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan { struct ggml_cplan {
@ -648,15 +618,39 @@ extern "C" {
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()` uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads; int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true // abort ggml_graph_compute when true
ggml_abort_callback abort_callback; ggml_abort_callback abort_callback;
void * abort_callback_data; void * abort_callback_data;
}; };
enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
GGML_CGRAPH_EVAL_ORDER_COUNT
};
struct ggml_hash_set {
size_t size;
struct ggml_tensor ** keys;
};
// computation graph
struct ggml_cgraph {
int size;
int n_nodes;
int n_leafs;
struct ggml_tensor ** nodes;
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;
struct ggml_hash_set visited_hash_table;
enum ggml_cgraph_eval_order order;
};
// scratch buffer // scratch buffer
// TODO: deprecate and remove
struct ggml_scratch { struct ggml_scratch {
size_t offs; size_t offs;
size_t size; size_t size;
@ -698,6 +692,8 @@ extern "C" {
GGML_API int64_t ggml_cycles(void); GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void); GGML_API int64_t ggml_cycles_per_ms(void);
GGML_API void ggml_print_backtrace(void);
// accepts a UTF-8 path, even on Windows // accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode); GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
@ -707,52 +703,50 @@ extern "C" {
GGML_API void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx); GGML_API void ggml_print_objects(const struct ggml_context * ctx);
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor); GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor); GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API int64_t ggml_blck_size(enum ggml_type type); GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED( GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
"use ggml_row_size() instead"); "use ggml_row_size() instead");
GGML_API const char * ggml_type_name(enum ggml_type type); GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op); GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op); GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op); GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor); GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_quantized(enum ggml_type type); GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
// TODO: temporary until model loading of ggml examples is refactored // TODO: temporary until model loading of ggml examples is refactored
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor); GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor); GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor); GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor); GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor); GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous() GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1 GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2 GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1); GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
// use this to compute the memory overhead of a tensor // use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void); GGML_API size_t ggml_tensor_overhead(void);
@ -760,9 +754,8 @@ extern "C" {
// main // main
GGML_API struct ggml_context * ggml_init (struct ggml_init_params params); GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_reset(struct ggml_context * ctx); GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API void ggml_free (struct ggml_context * ctx);
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
@ -839,7 +832,7 @@ extern "C" {
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor); GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor); GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name); GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
@ -960,22 +953,6 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return scalar // return scalar
GGML_API struct ggml_tensor * ggml_sum( GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -996,12 +973,6 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// count number of equal elements in a and b
GGML_API struct ggml_tensor * ggml_count_equal(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// if a is the same shape as b, and a is not parameter, return a // if a is the same shape as b, and a is not parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b // otherwise, return a new tensor: repeat(a) to fit in b
GGML_API struct ggml_tensor * ggml_repeat( GGML_API struct ggml_tensor * ggml_repeat(
@ -1132,14 +1103,6 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_exp(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_exp_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows // normalize along rows
GGML_API struct ggml_tensor * ggml_norm( GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1163,17 +1126,16 @@ extern "C" {
// group normalize along ne0*ne1*n_groups // group normalize along ne0*ne1*n_groups
// used in stable-diffusion // used in stable-diffusion
// TODO: eps is hardcoded to 1e-6 for now
GGML_API struct ggml_tensor * ggml_group_norm( GGML_API struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_groups, int n_groups);
float eps);
GGML_API struct ggml_tensor * ggml_group_norm_inplace( GGML_API struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_groups, int n_groups);
float eps);
// a - x // a - x
// b - dy // b - dy
@ -1235,7 +1197,7 @@ extern "C" {
size_t nb1, size_t nb1,
size_t nb2, size_t nb2,
size_t nb3, size_t nb3,
size_t offset); // in bytes size_t offset);
// b -> view(a,offset,nb1,nb2,3), return view(a) // b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_inplace( GGML_API struct ggml_tensor * ggml_set_inplace(
@ -1245,19 +1207,19 @@ extern "C" {
size_t nb1, size_t nb1,
size_t nb2, size_t nb2,
size_t nb3, size_t nb3,
size_t offset); // in bytes size_t offset);
GGML_API struct ggml_tensor * ggml_set_1d( GGML_API struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
size_t offset); // in bytes size_t offset);
GGML_API struct ggml_tensor * ggml_set_1d_inplace( GGML_API struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
size_t offset); // in bytes size_t offset);
// b -> view(a,offset,nb1,nb2,3), return modified a // b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set_2d( GGML_API struct ggml_tensor * ggml_set_2d(
@ -1265,7 +1227,7 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
size_t nb1, size_t nb1,
size_t offset); // in bytes size_t offset);
// b -> view(a,offset,nb1,nb2,3), return view(a) // b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_2d_inplace( GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@ -1273,7 +1235,7 @@ extern "C" {
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
size_t nb1, size_t nb1,
size_t offset); // in bytes size_t offset);
// a -> b, return view(b) // a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy( GGML_API struct ggml_tensor * ggml_cpy(
@ -1408,14 +1370,14 @@ extern "C" {
// supports 3D: a->ne[2] == b->ne[1] // supports 3D: a->ne[2] == b->ne[1]
GGML_API struct ggml_tensor * ggml_get_rows( GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // data struct ggml_tensor * a,
struct ggml_tensor * b); // row indices struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_get_rows_back( GGML_API struct ggml_tensor * ggml_get_rows_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // gradients of ggml_get_rows result struct ggml_tensor * a,
struct ggml_tensor * b, // row indices struct ggml_tensor * b,
struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape struct ggml_tensor * c);
GGML_API struct ggml_tensor * ggml_diag( GGML_API struct ggml_tensor * ggml_diag(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1476,10 +1438,11 @@ extern "C" {
struct ggml_tensor * b); struct ggml_tensor * b);
// rotary position embedding // rotary position embedding
// if (mode & 1) - skip n_past elements (NOT SUPPORTED) // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
// if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style // if mode & 2 == 1, GPT-NeoX style
// //
// b is an int32 vector with size a->ne[2], it contains the positions // b is an int32 vector with size a->ne[2], it contains the positions
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope( GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1496,7 +1459,6 @@ extern "C" {
int mode); int mode);
// custom RoPE // custom RoPE
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope_ext( GGML_API struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1559,16 +1521,16 @@ extern "C" {
"use ggml_rope_ext_inplace instead"); "use ggml_rope_ext_inplace instead");
// compute correction dims for YaRN RoPE scaling // compute correction dims for YaRN RoPE scaling
void ggml_rope_yarn_corr_dims( GGML_CALL void ggml_rope_yarn_corr_dims(
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]); int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
// rotary position embedding backward, i.e compute dx from dy // rotary position embedding backward, i.e compute dx from dy
// a - dy // a - dy
GGML_API struct ggml_tensor * ggml_rope_back( GGML_API struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // gradients of ggml_rope result struct ggml_tensor * a,
struct ggml_tensor * b, // positions struct ggml_tensor * b,
struct ggml_tensor * c, // freq factors struct ggml_tensor * c,
int n_dims, int n_dims,
int mode, int mode,
int n_ctx_orig, int n_ctx_orig,
@ -1587,49 +1549,34 @@ extern "C" {
float min, float min,
float max); float max);
// im2col
// converts data into a format that effectively results in a convolution when combined with matrix multiplication
GGML_API struct ggml_tensor * ggml_im2col( GGML_API struct ggml_tensor * ggml_im2col(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel struct ggml_tensor * a,
struct ggml_tensor * b, // data struct ggml_tensor * b,
int s0, // stride dimension 0 int s0,
int s1, // stride dimension 1 int s1,
int p0, // padding dimension 0 int p0,
int p1, // padding dimension 1 int p1,
int d0, // dilation dimension 0 int d0,
int d1, // dilation dimension 1 int d1,
bool is_2D, bool is_2D,
enum ggml_type dst_type); enum ggml_type dst_type);
GGML_API struct ggml_tensor * ggml_im2col_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // gradient of im2col output
int64_t * ne, // shape of im2col input
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1, // dilation dimension 1
bool is_2D);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d( GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel struct ggml_tensor * a,
struct ggml_tensor * b, // data struct ggml_tensor * b,
int s0, // stride dimension 0 int s0,
int s1, // stride dimension 1 int s1,
int p0, // padding dimension 0 int p0,
int p1, // padding dimension 1 int p1,
int d0, // dilation dimension 0 int d0,
int d1); // dilation dimension 1 int d1);
GGML_API struct ggml_tensor * ggml_conv_1d( GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel struct ggml_tensor * a,
struct ggml_tensor * b, // data struct ggml_tensor * b,
int s0, // stride int s0, // stride
int p0, // padding int p0, // padding
int d0); // dilation int d0); // dilation
@ -1638,29 +1585,29 @@ extern "C" {
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d) // alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph( GGML_API struct ggml_tensor* ggml_conv_1d_ph(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel struct ggml_tensor * a,
struct ggml_tensor * b, // data struct ggml_tensor * b,
int s, // stride int s,
int d); // dilation int d);
GGML_API struct ggml_tensor * ggml_conv_transpose_1d( GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel struct ggml_tensor * a,
struct ggml_tensor * b, // data struct ggml_tensor * b,
int s0, // stride int s0,
int p0, // padding int p0,
int d0); // dilation int d0);
GGML_API struct ggml_tensor * ggml_conv_2d( GGML_API struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel struct ggml_tensor * a,
struct ggml_tensor * b, // data struct ggml_tensor * b,
int s0, // stride dimension 0 int s0,
int s1, // stride dimension 1 int s1,
int p0, // padding dimension 0 int p0,
int p1, // padding dimension 1 int p1,
int d0, // dilation dimension 0 int d0,
int d1); // dilation dimension 1 int d1);
// kernel size is a->ne[0] x a->ne[1] // kernel size is a->ne[0] x a->ne[1]
@ -1722,18 +1669,6 @@ extern "C" {
float p0, float p0,
float p1); float p1);
GGML_API struct ggml_tensor * ggml_pool_2d_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * af, // "a"/input used in forward pass
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
float p0,
float p1);
// nearest interpolate // nearest interpolate
// multiplies ne0 and ne1 by scale factor // multiplies ne0 and ne1 by scale factor
// used in stable-diffusion // used in stable-diffusion
@ -1808,8 +1743,7 @@ extern "C" {
struct ggml_tensor * v, struct ggml_tensor * v,
struct ggml_tensor * mask, struct ggml_tensor * mask,
float scale, float scale,
float max_bias, float max_bias);
float logit_softcap);
GGML_API void ggml_flash_attn_ext_set_prec( GGML_API void ggml_flash_attn_ext_set_prec(
struct ggml_tensor * a, struct ggml_tensor * a,
@ -1826,8 +1760,10 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_ssm_conv( GGML_API struct ggml_tensor * ggml_ssm_conv(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * sx, struct ggml_tensor * s,
struct ggml_tensor * c); struct ggml_tensor * x,
struct ggml_tensor * c,
struct ggml_tensor * sq);
GGML_API struct ggml_tensor * ggml_ssm_scan( GGML_API struct ggml_tensor * ggml_ssm_scan(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -1836,7 +1772,8 @@ extern "C" {
struct ggml_tensor * dt, struct ggml_tensor * dt,
struct ggml_tensor * A, struct ggml_tensor * A,
struct ggml_tensor * B, struct ggml_tensor * B,
struct ggml_tensor * C); struct ggml_tensor * C,
struct ggml_tensor * sq);
// partition into non-overlapping windows with padding if needed // partition into non-overlapping windows with padding if needed
// example: // example:
@ -1888,15 +1825,6 @@ extern "C" {
struct ggml_tensor * pw, struct ggml_tensor * pw,
struct ggml_tensor * ph); struct ggml_tensor * ph);
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * r,
struct ggml_tensor * tf,
struct ggml_tensor * td,
struct ggml_tensor * state);
// custom operators // custom operators
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *); typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@ -1980,8 +1908,7 @@ extern "C" {
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata); typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata); typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
#define GGML_N_TASKS_MAX (-1) #define GGML_N_TASKS_MAX -1
// n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
GGML_API struct ggml_tensor * ggml_map_custom1( GGML_API struct ggml_tensor * ggml_map_custom1(
struct ggml_context * ctx, struct ggml_context * ctx,
@ -2035,83 +1962,43 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_cross_entropy_loss( GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, // logits struct ggml_tensor * a,
struct ggml_tensor * b); // labels struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back( GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // logits
struct ggml_tensor * b, // labels
struct ggml_tensor * c); // gradients of cross_entropy_loss result
// AdamW optimizer step
// Paper: https://arxiv.org/pdf/1711.05101v3.pdf
// PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
GGML_API struct ggml_tensor * ggml_opt_step_adamw(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * grad, struct ggml_tensor * b,
float alpha, struct ggml_tensor * c);
float beta1,
float beta2,
float eps,
float wd); // weight decay
// //
// automatic differentiation // automatic differentiation
// //
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor); GGML_API void ggml_set_param(
GGML_API void ggml_set_loss(struct ggml_tensor * tensor); struct ggml_context * ctx,
struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate); GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
GGML_API void ggml_build_opt_adamw(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
float alpha,
float beta1,
float beta2,
float eps,
float wd); // weight decay
// graph allocation in a context // graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads); GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst); GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1 GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph); GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API size_t ggml_graph_overhead(void); GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads); GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute() // ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data // when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan( GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
const struct ggml_cgraph * cgraph, GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context // same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads); GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
@ -2175,10 +2062,6 @@ extern "C" {
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel); typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data); typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
// optimization parameters // optimization parameters
// //
// see ggml.c (ggml_opt_default_params) for default values // see ggml.c (ggml_opt_default_params) for default values
@ -2504,16 +2387,10 @@ extern "C" {
GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void); GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_riscv_v (void);
GGML_API int ggml_cpu_has_sycl (void); GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_rpc (void); GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void); GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);
// get the sve vector length in bytes
GGML_API int ggml_cpu_get_sve_cnt(void);
// //
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks
@ -2527,31 +2404,20 @@ extern "C" {
#endif #endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k); typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k); typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx, typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc); const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef struct { typedef struct {
const char * type_name; const char * type_name;
int64_t blck_size; int blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
size_t type_size; size_t type_size;
bool is_quantized; bool is_quantized;
ggml_to_float_t to_float; ggml_to_float_t to_float;
ggml_from_float_t from_float; ggml_from_float_t from_float;
ggml_from_float_t from_float_ref; ggml_from_float_t from_float_reference;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot; ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type; enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously int64_t nrows; // number of rows to process simultaneously;
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t; } ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type); GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);

View File

@ -26,9 +26,6 @@ if (NOT MSVC)
endif() endif()
endif() endif()
unset(GGML_EXTRA_LIBS_PRIVATE)
unset(GGML_EXTRA_LIBS_PUBLIC)
if (APPLE AND GGML_ACCELERATE) if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate) find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK) if (ACCELERATE_FRAMEWORK)
@ -38,7 +35,7 @@ if (APPLE AND GGML_ACCELERATE)
add_compile_definitions(ACCELERATE_NEW_LAPACK) add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64) add_compile_definitions(ACCELERATE_LAPACK_ILP64)
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK}) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
else() else()
message(WARNING "Accelerate framework not found") message(WARNING "Accelerate framework not found")
endif() endif()
@ -90,7 +87,7 @@ if (GGML_METAL)
COMMENT "Generate assembly for embedded Metal library" COMMENT "Generate assembly for embedded Metal library"
) )
list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM}) set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
else() else()
if (GGML_METAL_SHADER_DEBUG) if (GGML_METAL_SHADER_DEBUG)
# custom command to do the following: # custom command to do the following:
@ -135,24 +132,13 @@ if (GGML_METAL)
) )
endif() # GGML_METAL_EMBED_LIBRARY endif() # GGML_METAL_EMBED_LIBRARY
list(APPEND GGML_EXTRA_LIBS_PRIVATE set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
${FOUNDATION_LIBRARY} ${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK} ${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK} ${METALKIT_FRAMEWORK}
) )
endif() endif()
if (GGML_MUSA)
set(CMAKE_C_COMPILER clang)
set(CMAKE_C_EXTENSIONS OFF)
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_EXTENSIONS OFF)
set(GGML_CUDA ON)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
endif()
if (GGML_OPENMP) if (GGML_OPENMP)
find_package(OpenMP) find_package(OpenMP)
if (OpenMP_FOUND) if (OpenMP_FOUND)
@ -160,12 +146,7 @@ if (GGML_OPENMP)
add_compile_definitions(GGML_USE_OPENMP) add_compile_definitions(GGML_USE_OPENMP)
list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
if (GGML_MUSA)
list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp")
list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
endif()
else() else()
message(WARNING "OpenMP not found") message(WARNING "OpenMP not found")
endif() endif()
@ -247,8 +228,8 @@ if (GGML_BLAS)
set(GGML_HEADERS_BLAS ../include/ggml-blas.h) set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
set(GGML_SOURCES_BLAS ggml-blas.cpp) set(GGML_SOURCES_BLAS ggml-blas.cpp)
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES}) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES})
list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS}) set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
else() else()
message(WARNING "BLAS not found, please refer to " message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors" "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@ -257,24 +238,18 @@ if (GGML_BLAS)
endif() endif()
if (GGML_LLAMAFILE) if (GGML_LLAMAFILE)
message(STATUS "Using llamafile") message(STATUS "Using ggml SGEMM")
add_compile_definitions(GGML_USE_LLAMAFILE) add_compile_definitions(GGML_USE_LLAMAFILE)
set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h) set(GGML_HEADERS_LLAMAFILE sgemm.h)
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp) set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
endif() endif()
if (GGML_CUDA) if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
if (GGML_MUSA)
list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
find_package(MUSAToolkit)
set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
else()
find_package(CUDAToolkit) find_package(CUDAToolkit)
endif()
if (CUDAToolkit_FOUND) if (CUDAToolkit_FOUND)
message(STATUS "CUDA found") message(STATUS "CUDA found")
@ -293,11 +268,7 @@ if (GGML_CUDA)
endif() endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
if (GGML_MUSA)
set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
else()
enable_language(CUDA) enable_language(CUDA)
endif()
file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh") file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h") list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@ -324,15 +295,21 @@ if (GGML_CUDA)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA) list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
# TODO: for now CUDA graphs should be used only with llama.cpp
# https://github.com/ggerganov/whisper.cpp/issues/2258
message(STATUS "CMAKE_PROJECT_NAME: ${CMAKE_PROJECT_NAME}")
if (CMAKE_PROJECT_NAME STREQUAL "llama.cpp")
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
message(STATUS "GGML_CUDA_USE_GRAPHS enabled")
else()
message(STATUS "GGML_CUDA_USE_GRAPHS disabled")
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X}) add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y}) add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER}) add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE}) add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
if (GGML_CUDA_GRAPHS)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
endif()
if (GGML_CUDA_FORCE_DMMV) if (GGML_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV) add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif() endif()
@ -361,40 +338,21 @@ if (GGML_CUDA)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY) add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif() endif()
if (GGML_MUSA)
set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
foreach(SOURCE ${GGML_SOURCES_CUDA})
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
endforeach()
endif()
if (GGML_STATIC) if (GGML_STATIC)
if (WIN32) if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else () else ()
if (GGML_MUSA) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static)
else()
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
endif() endif()
else() else()
if (GGML_MUSA) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas)
else()
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
endif() endif()
if (GGML_CUDA_NO_VMM) if (GGML_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so) # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else() else()
if (GGML_MUSA) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
else()
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
endif() endif()
else() else()
message(WARNING "CUDA not found") message(WARNING "CUDA not found")
@ -488,17 +446,13 @@ if (GGML_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ) add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif() endif()
if (GGML_CUDA_FORCE_CUBLAS)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()
if (GGML_CUDA_NO_PEER_COPY) if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY) add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif() endif()
if (CXX_IS_HIPCC) if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
else() else()
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP) set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
endif() endif()
@ -507,34 +461,27 @@ if (GGML_HIPBLAS)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm") message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif() endif()
list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
endif() endif()
if (GGML_SYCL) if (GGML_SYCL)
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$") if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD") message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
endif() endif()
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL) if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
if (DEFINED ENV{ONEAPI_ROOT})
message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
elseif(SUPPORTS_SYCL)
message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
source /opt/intel/oneapi/setvars.sh")
else()
message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
endif() endif()
message(STATUS "SYCL found")
#todo: AOT #todo: AOT
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
message(STATUS "SYCL found")
list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL) list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
if (GGML_SYCL_F16) if (GGML_SYCL_F16)
if (GGML_SYCL_TARGET STREQUAL "AMD")
message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
endif()
add_compile_definitions(GGML_SYCL_F16) add_compile_definitions(GGML_SYCL_F16)
endif() endif()
@ -542,18 +489,12 @@ if (GGML_SYCL)
add_compile_definitions(GGML_SYCL_FORCE_MMQ) add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif() endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl") add_compile_options(-I./) #include DPCT
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
if (GGML_SYCL_TARGET STREQUAL "NVIDIA") if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
# INFO: Allowed Sub_group_sizes are not consistent through all
# hip targets. For example, 64 is used for certain models, but the backend
# does not support it.
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
endif() endif()
file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp") file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
@ -562,35 +503,16 @@ if (GGML_SYCL)
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp") file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp") list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
find_package(DNNL)
message("-- DNNL found:" ${DNNL_FOUND})
if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
else()
add_compile_definitions(GGML_SYCL_DNNL=0)
endif()
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
endif()
if (WIN32) if (WIN32)
find_package(IntelSYCL REQUIRED) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
find_package(MKL REQUIRED)
list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else() else()
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (GGML_SYCL_TARGET STREQUAL "INTEL") if (GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA") elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda") set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
if (GGML_SYCL_HIP_TARGET STREQUAL "")
message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
endif() endif()
endif() endif()
endif() endif()
@ -601,7 +523,7 @@ if (GGML_RPC)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC) list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)
if (WIN32) if (WIN32)
list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
endif() endif()
set(GGML_HEADERS_RPC ../include/ggml-rpc.h) set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
@ -609,11 +531,14 @@ if (GGML_RPC)
endif() endif()
if (GGML_VULKAN) if (GGML_VULKAN)
find_package(Vulkan COMPONENTS glslc REQUIRED) find_package(Vulkan)
if (Vulkan_FOUND) if (Vulkan_FOUND)
message(STATUS "Vulkan found") message(STATUS "Vulkan found")
set(GGML_HEADERS_VULKAN ../include/ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN) list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN)
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build # Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
@ -634,14 +559,6 @@ if (GGML_VULKAN)
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG) add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
endif() endif()
if (GGML_VULKAN_SHADER_DEBUG_INFO)
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
endif()
if (GGML_VULKAN_PERF)
add_compile_definitions(GGML_VULKAN_PERF)
endif()
if (GGML_VULKAN_VALIDATE) if (GGML_VULKAN_VALIDATE)
add_compile_definitions(GGML_VULKAN_VALIDATE) add_compile_definitions(GGML_VULKAN_VALIDATE)
endif() endif()
@ -650,37 +567,7 @@ if (GGML_VULKAN)
add_compile_definitions(GGML_VULKAN_RUN_TESTS) add_compile_definitions(GGML_VULKAN_RUN_TESTS)
endif() endif()
add_subdirectory(vulkan-shaders) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
add_custom_command(
OUTPUT ${_ggml_vk_header}
${_ggml_vk_source}
COMMAND ${_ggml_vk_genshaders_cmd}
--glslc ${Vulkan_GLSLC_EXECUTABLE}
--input-dir ${_ggml_vk_input_dir}
--output-dir ${_ggml_vk_output_dir}
--target-hpp ${_ggml_vk_header}
--target-cpp ${_ggml_vk_source}
--no-clean
DEPENDS ${_ggml_vk_shader_deps}
COMMENT "Generate vulkan shaders"
)
set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan)
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR})
else() else()
message(WARNING "Vulkan not found") message(WARNING "Vulkan not found")
endif() endif()
@ -839,8 +726,8 @@ if (GGML_KOMPUTE)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE) list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)
list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute) set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute)
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}) set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
else() else()
message(WARNING "Kompute not found") message(WARNING "Kompute not found")
endif() endif()
@ -856,71 +743,6 @@ if (GGML_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind) target_link_libraries(ggml PUBLIC memkind)
endif() endif()
if (GGML_CANN)
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif()
if (CANN_INSTALL_DIR)
# Only Support Linux.
if (GGML_CANN)
if (NOT UNIX)
set(GGML_CANN OFF)
message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN")
endif()
endif()
# Supported platforms: x86-64, arm64
if (GGML_CANN)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
else()
set(GGML_CANN OFF)
message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN")
endif()
endif()
# Set header and libs
if(GGML_CANN)
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)
add_subdirectory(ggml-cann/kernels)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
ascendc_kernels
)
set(GGML_HEADERS_CANN "../include/ggml-cann.h")
file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp")
list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} )
list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS})
list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
endif()
else()
set(GGML_CANN OFF)
message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN")
endif()
if(NOT GGML_CANN)
message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.")
endif()
endif()
function(get_flags CCID CCVER) function(get_flags CCID CCVER)
set(C_FLAGS "") set(C_FLAGS "")
set(CXX_FLAGS "") set(CXX_FLAGS "")
@ -939,11 +761,9 @@ function(get_flags CCID CCVER)
set(C_FLAGS -Wdouble-promotion) set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds) set(CXX_FLAGS -Wno-array-bounds)
if (NOT GGML_MUSA)
if (CCVER VERSION_GREATER_EQUAL 7.1.0) if (CCVER VERSION_GREATER_EQUAL 7.1.0)
list(APPEND CXX_FLAGS -Wno-format-truncation) list(APPEND CXX_FLAGS -Wno-format-truncation)
endif() endif()
endif()
if (CCVER VERSION_GREATER_EQUAL 8.1.0) if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi) list(APPEND CXX_FLAGS -Wextra-semi)
endif() endif()
@ -1201,7 +1021,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
endif() endif()
if (GGML_AVX512) if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f) list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw) list(APPEND ARCH_FLAGS -mavx512bw)
endif() endif()
if (GGML_AVX512_VBMI) if (GGML_AVX512_VBMI)
@ -1275,7 +1094,7 @@ endif()
# Data types, macros and functions related to controlling CPU affinity and # Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc # some memory allocation are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android") if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions(_GNU_SOURCE) add_compile_definitions(_GNU_SOURCE)
endif() endif()
@ -1325,7 +1144,7 @@ add_library(ggml
../include/ggml-backend.h ../include/ggml-backend.h
ggml.c ggml.c
ggml-alloc.c ggml-alloc.c
ggml-backend.cpp ggml-backend.c
ggml-quants.c ggml-quants.c
ggml-quants.h ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA} ${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
@ -1338,8 +1157,6 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM} ${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS} ${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE} ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
ggml-aarch64.c ggml-aarch64.h
) )
if (EMSCRIPTEN) if (EMSCRIPTEN)
@ -1349,23 +1166,15 @@ endif()
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC}) target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
target_include_directories(ggml PUBLIC ../include) target_include_directories(ggml PUBLIC ../include)
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES}) target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
target_compile_features (ggml PRIVATE c_std_11) # don't bump target_compile_features (ggml PRIVATE c_std_11) # don't bump
list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads) target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
find_library(MATH_LIBRARY m) find_library(MATH_LIBRARY m)
if (MATH_LIBRARY) if (MATH_LIBRARY)
if (NOT WIN32 OR NOT GGML_SYCL) target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
endif()
endif() endif()
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
if (BUILD_SHARED_LIBS) if (BUILD_SHARED_LIBS)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
endif() endif()

File diff suppressed because it is too large Load Diff

View File

@ -1,39 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif

View File

@ -91,7 +91,8 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer"); GGML_ASSERT(!"not enough space in the buffer");
return;
} }
void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
@ -132,7 +133,7 @@ static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset,
return; return;
} }
} }
GGML_ABORT("out of allocated_tensors"); GGML_ASSERT(!"out of allocated_tensors");
} }
static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) { for (int i = 0; i < 1024; i++) {
@ -141,7 +142,8 @@ static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offs
return; return;
} }
} }
GGML_ABORT("tried to free tensor %s not found\n", tensor->name); fprintf(stderr, "tried to free tensor %s not found\n", tensor->name);
GGML_ASSERT(!"tensor not found");
} }
#endif #endif
@ -174,7 +176,8 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
// this should never happen // this should never happen
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail); __func__, size, max_avail);
GGML_ABORT("not enough space in the buffer"); GGML_ASSERT(!"not enough space in the buffer");
GGML_UNREACHABLE();
} }
} }
@ -294,12 +297,6 @@ static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
alloc->free_blocks[0].offset = 0; alloc->free_blocks[0].offset = 0;
alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
alloc->max_size = 0; alloc->max_size = 0;
#ifdef GGML_ALLOCATOR_DEBUG
for (int i = 0; i < 1024; i++) {
alloc->allocated_tensors[i].tensor = NULL;
}
#endif
} }
static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) { static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
@ -446,7 +443,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
} }
} }
ggml_hash_set_free(&galloc->hash_set); free(galloc->hash_set.keys);
free(galloc->hash_values); free(galloc->hash_values);
free(galloc->bufts); free(galloc->bufts);
free(galloc->buffers); free(galloc->buffers);
@ -459,7 +456,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
typedef struct ggml_gallocr * ggml_gallocr_t; typedef struct ggml_gallocr * ggml_gallocr_t;
static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
size_t i = ggml_hash_find_or_insert(&galloc->hash_set, t); size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
return &galloc->hash_values[i]; return &galloc->hash_values[i];
} }
@ -568,8 +565,8 @@ static int get_node_buffer_id(const int * node_buffer_ids, int i) {
static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
// clear hash tables // clear hash tables
ggml_hash_set_reset(&galloc->hash_set); memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
// allocate leafs // allocate leafs
// these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes // these may be tensors that the application is not using in the graph, but may still want to allocate for other purposes
@ -674,19 +671,21 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
} }
bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) { bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids, const int * leaf_buffer_ids) {
size_t min_hash_size = graph->n_nodes + graph->n_leafs; size_t hash_size = graph->visited_hash_table.size;
// add 25% margin to avoid hash collisions
min_hash_size += min_hash_size / 4;
// initialize hash table // initialize hash table
if (galloc->hash_set.size < min_hash_size) { if (galloc->hash_set.size < hash_size) {
ggml_hash_set_free(&galloc->hash_set); free(galloc->hash_set.keys);
galloc->hash_set = ggml_hash_set_new(min_hash_size);
GGML_ASSERT(galloc->hash_set.keys != NULL);
free(galloc->hash_values); free(galloc->hash_values);
galloc->hash_values = malloc(sizeof(struct hash_node) * galloc->hash_set.size); galloc->hash_set.size = hash_size;
galloc->hash_set.keys = calloc(hash_size, sizeof(struct ggml_tensor *));
galloc->hash_values = calloc(hash_size, sizeof(struct hash_node));
GGML_ASSERT(galloc->hash_set.keys != NULL);
GGML_ASSERT(galloc->hash_values != NULL); GGML_ASSERT(galloc->hash_values != NULL);
} else {
// reset hash table
memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
} }
// reset allocators // reset allocators
@ -777,7 +776,6 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false; return false;
} }
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
} }
} }
@ -818,7 +816,8 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
} }
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) { static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node); ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
return talloc->size_max >= node_size; return talloc->size_max >= node_size;
} }

View File

@ -9,226 +9,144 @@ extern "C" {
#endif #endif
// //
// Backend buffer type // Backend buffer
// //
// buffer type
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i { struct ggml_backend_buffer_type_i {
const char * (*get_name) (ggml_backend_buffer_type_t buft); const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
// allocate a buffer of this type // allocate a buffer of this type
ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
// tensor alignment // tensor alignment
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
// (optional) max buffer size that can be allocated (defaults to SIZE_MAX) // max buffer size that can be allocated
size_t (*get_max_size) (ggml_backend_buffer_type_t buft); size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes) // data size needed to allocate the tensor, including padding
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory (defaults to false) // check if tensor data is in host memory
bool (*is_host) (ggml_backend_buffer_type_t buft); bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
}; };
struct ggml_backend_buffer_type { struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface; struct ggml_backend_buffer_type_i iface;
ggml_backend_dev_t device; ggml_backend_buffer_type_context_t context;
void * context;
}; };
// // buffer
// Backend buffer typedef void * ggml_backend_buffer_context_t;
//
struct ggml_backend_buffer_i { struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer); const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
void (*free_buffer) (ggml_backend_buffer_t buffer); void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
// base address of the buffer void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void * (*get_base) (ggml_backend_buffer_t buffer); void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
// (optional) initialize a tensor in the buffer (eg. add tensor extras) void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
// tensor data access void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
void (*memset_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size); void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// (optional) tensor copy: dst is in the buffer, src may be in any buffer, including buffers from a different backend (return false if not supported)
bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst);
// clear the entire buffer
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
// (optional) reset any internal state due to tensor initialization, such as tensor extras
void (*reset) (ggml_backend_buffer_t buffer);
}; };
struct ggml_backend_buffer { struct ggml_backend_buffer {
struct ggml_backend_buffer_i iface; struct ggml_backend_buffer_i iface;
ggml_backend_buffer_type_t buft; ggml_backend_buffer_type_t buft;
void * context; ggml_backend_buffer_context_t context;
size_t size; size_t size;
enum ggml_backend_buffer_usage usage; enum ggml_backend_buffer_usage usage;
}; };
ggml_backend_buffer_t ggml_backend_buffer_init( GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft, ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface, struct ggml_backend_buffer_i iface,
void * context, ggml_backend_buffer_context_t context,
size_t size); size_t size);
// do not use directly, use ggml_backend_tensor_copy instead // do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// multi-buffer
// buffer that contains a collection of buffers // buffer that contains a collection of buffers
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers); GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer); GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
// //
// Backend (stream) // Backend
// //
typedef void * ggml_backend_context_t;
struct ggml_backend_i { struct ggml_backend_i {
const char * (*get_name)(ggml_backend_t backend); const char * (*GGML_CALL get_name)(ggml_backend_t backend);
void (*free)(ggml_backend_t backend); void (*GGML_CALL free)(ggml_backend_t backend);
// buffer allocation // buffer allocation
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access // (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst); bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations // (optional) complete all pending operations
void (*synchronize)(ggml_backend_t backend); void (*GGML_CALL synchronize)(ggml_backend_t backend);
// (optional) compute graph with a plan (not used currently) // compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); // create a new plan for a graph
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
void (*graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph); void (*GGML_CALL graph_plan_update) (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
// compute the graph with the plan // compute the graph with the plan
enum ggml_status (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph (always async if supported by the backend) // compute graph without a plan (async)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph); enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
// new backends should implement the device interface instead
// These functions are being moved to the device interface
// check if the backend can compute an operation // check if the backend can compute an operation
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op); bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend can use tensors allocated in a buffer type // check if the backend can use tensors allocated in a buffer type
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft); bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend // these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily // even if the weight has to be copied from the CPU temporarily
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op); bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization // (optional) event synchronization
// record an event on this stream // create a new event that can record events on this backend instance
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event); ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
// wait for an event on on a different stream void (*GGML_CALL event_free) (ggml_backend_event_t event);
void (*event_wait) (ggml_backend_t backend, ggml_backend_event_t event); // record an event on the backend instance that created it
void (*GGML_CALL event_record) (ggml_backend_event_t event);
// wait for an event on on a different backend instance
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
// block until an event is recorded
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
}; };
struct ggml_backend { struct ggml_backend {
ggml_guid_t guid; ggml_guid_t guid;
struct ggml_backend_i iface; struct ggml_backend_i iface;
ggml_backend_dev_t device; ggml_backend_context_t context;
void * context;
}; };
struct ggml_backend_event { struct ggml_backend_event {
struct ggml_backend_device * device; ggml_backend_t backend;
void * context; void * context;
}; };
// //
// Backend device // Backend registry
// //
// Note: if additional properties are needed, we should add a struct with all of them typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
// the current functions to obtain the properties can remain, since they are more convenient for often used properties
struct ggml_backend_device_i {
// device name: short identifier for this device, such as "CPU" or "CUDA0"
const char * (*get_name)(ggml_backend_dev_t dev);
// device description: short informative description of the device, could be the model name GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
const char * (*get_description)(ggml_backend_dev_t dev);
// device memory in bytes
void (*get_memory)(ggml_backend_dev_t dev, size_t * free, size_t * total);
// device type
enum ggml_backend_dev_type (*get_type)(ggml_backend_dev_t dev);
// device properties
void (*get_props)(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props);
// backend (stream) initialization
ggml_backend_t (*init_backend)(ggml_backend_dev_t dev, const char * params);
// preferred buffer type
ggml_backend_buffer_type_t (*get_buffer_type)(ggml_backend_dev_t dev);
// (optional) host buffer type (in system memory, typically this is a pinned memory buffer for faster transfers between host and device)
ggml_backend_buffer_type_t (*get_host_buffer_type)(ggml_backend_dev_t dev);
// (optional) buffer from pointer: create a buffer from a host pointer (useful for memory mapped models and importing data from other libraries)
ggml_backend_buffer_t (*buffer_from_host_ptr)(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size);
// check if the backend can compute an operation
bool (*supports_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
// check if the backend can use tensors allocated in a buffer type
bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
// (optional) event synchronization
ggml_backend_event_t (*event_new) (ggml_backend_dev_t dev);
void (*event_free) (ggml_backend_dev_t dev, ggml_backend_event_t event);
void (*event_synchronize) (ggml_backend_dev_t dev, ggml_backend_event_t event);
};
struct ggml_backend_device {
struct ggml_backend_device_i iface;
ggml_backend_reg_t reg;
void * context;
};
//
// Backend (reg)
//
struct ggml_backend_reg_i {
const char * (*get_name)(ggml_backend_reg_t reg);
// enumerate available devices
size_t (*get_device_count)(ggml_backend_reg_t reg);
ggml_backend_dev_t (*get_device)(ggml_backend_reg_t reg, size_t index);
// (optional) get a pointer to a function in the backend
// backends can add custom functions that are not part of the standard ggml-backend interface
void * (*get_proc_address)(ggml_backend_reg_t reg, const char * name);
};
struct ggml_backend_reg {
// int api_version; // TODO: for dynamic loading
struct ggml_backend_reg_i iface;
void * context;
};
// Internal backend registry API
void ggml_backend_register(ggml_backend_reg_t reg);
void ggml_backend_device_register(ggml_backend_dev_t device);
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
// typedef ggml_backend_register_t * (*ggml_backend_init)(void);
#ifdef __cplusplus #ifdef __cplusplus
} }

Some files were not shown because too many files have changed in this diff Show More