Compare commits


1 Commit

Commit ceb77363cd "ggml : disable CUDA graphs for non-llama.cpp projects" (2024-06-26 20:14:22 +03:00)
288 changed files with 184949 additions and 54489 deletions


@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git cmake libsdl2-dev
apt-get install -y build-essential git cmake
WORKDIR /app


@ -17,7 +17,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
ENV GGML_CUDA=1
RUN apt-get update && \
apt-get install -y build-essential libsdl2-dev \
apt-get install -y build-essential \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
# Ref: https://stackoverflow.com/a/53464012


@ -12,7 +12,7 @@ FROM ubuntu:22.04 AS runtime
WORKDIR /app
RUN apt-get update && \
apt-get install -y curl ffmpeg libsdl2-dev \
apt-get install -y curl ffmpeg \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
COPY --from=build /app /app


@ -13,10 +13,10 @@ jobs:
ubuntu-latest:
runs-on: ubuntu-latest
steps:
- uses: actions/setup-go@v5
- uses: actions/setup-go@v3
with:
go-version: '^1.23'
- uses: actions/checkout@v4
go-version: '^1.19'
- uses: actions/checkout@v1
- run: |
cd bindings/go
make test


@ -3,73 +3,20 @@ on:
push:
paths:
- bindings/ruby/**
- src/whisper.cpp
- include/whisper.h
- ggml/src/ggml.c
- ggml/src/ggml-impl.h
- ggml/src/ggml-aarch64.h
- ggml/src/ggml-aarch64.c
- ggml/src/ggml-alloc.c
- ggml/src/ggml-backend-impl.h
- ggml/src/ggml-backend.cpp
- ggml/src/ggml-common.h
- ggml/src/ggml-quants.h
- ggml/src/ggml-quants.c
- ggml/src/ggml-cpu-impl.h
- ggml/src/ggml-metal.m
- ggml/src/ggml-metal.metal
- ggml/src/ggml-blas.cpp
- ggml/include/ggml.h
- ggml/include/ggml-alloc.h
- ggml/include/ggml-backend.h
- ggml/include/ggml-cuda.h
- ggml/include/ggml-kompute.h
- ggml/include/ggml-metal.h
- ggml/include/ggml-sycl.h
- ggml/include/ggml-vulkan.h
- ggml/include/ggml-blas.h
- scripts/get-flags.mk
- examples/dr_wav.h
- whisper.h
pull_request:
paths:
- bindings/ruby/**
- src/whisper.cpp
- include/whisper.h
- ggml/src/ggml.c
- ggml/src/ggml-impl.h
- ggml/src/ggml-aarch64.h
- ggml/src/ggml-aarch64.c
- ggml/src/ggml-alloc.c
- ggml/src/ggml-backend-impl.h
- ggml/src/ggml-backend.cpp
- ggml/src/ggml-common.h
- ggml/src/ggml-quants.h
- ggml/src/ggml-quants.c
- ggml/src/ggml-cpu-impl.h
- ggml/src/ggml-metal.m
- ggml/src/ggml-metal.metal
- ggml/src/ggml-blas.cpp
- ggml/include/ggml.h
- ggml/include/ggml-alloc.h
- ggml/include/ggml-backend.h
- ggml/include/ggml-cuda.h
- ggml/include/ggml-kompute.h
- ggml/include/ggml-metal.h
- ggml/include/ggml-sycl.h
- ggml/include/ggml-vulkan.h
- ggml/include/ggml-blas.h
- scripts/get-flags.mk
- examples/dr_wav.h
- whisper.h
jobs:
ubuntu-latest:
runs-on: ubuntu-latest
defaults:
run:
working-directory: bindings/ruby
steps:
- uses: ruby/setup-ruby@v1
with:
ruby-version: '3.0'
- uses: actions/checkout@v4
- run: rake test
- uses: actions/checkout@v1
- run: |
cd bindings/ruby/ext
ruby extconf.rb && make


@ -59,7 +59,7 @@ jobs:
uses: cross-platform-actions/action@v0.24.0
with:
operating_system: freebsd
version: '13.3'
version: '13.2'
run: |
sudo pkg update
sudo pkg install -y gmake sdl2
@ -586,75 +586,73 @@ jobs:
cd whisper/examples/whisper.android
./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
# TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
# android_java:
# runs-on: ubuntu-latest
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: set up JDK 11
# uses: actions/setup-java@v4
# with:
# java-version: '11'
# distribution: 'temurin'
# cache: gradle
#
# - name: Setup Android SDK
# uses: android-actions/setup-android@v3
# with:
# cmdline-tools-version: 9.0
#
# - name: Build
# run: |
# cd examples/whisper.android.java
# chmod +x ./gradlew
# ./gradlew assembleRelease
android_java:
runs-on: ubuntu-latest
# TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598
# java:
# needs: [ 'windows' ]
# runs-on: windows-latest
# steps:
# - uses: actions/checkout@v4
#
# - name: Install Java
# uses: actions/setup-java@v4
# with:
# distribution: zulu
# java-version: 20
#
# - name: Download Windows lib
# uses: actions/download-artifact@v4
# with:
# name: win32-x86-64_whisper.dll
# path: bindings/java/build/generated/resources/main/win32-x86-64
#
# - name: Build
# run: |
# models\download-ggml-model.cmd tiny.en
# cd bindings/java
# chmod +x ./gradlew
# ./gradlew build
#
# - name: Upload jar
# uses: actions/upload-artifact@v4
# with:
# name: whispercpp.jar
# path: bindings/java/build/libs/whispercpp-*.jar
#
# - name: Publish package
# if: ${{ github.ref == 'refs/heads/master' }}
# uses: gradle/gradle-build-action@v2.4.2
# with:
# arguments: publish
# build-root-directory: bindings/java
# env:
# MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
# MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
# PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
# PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
steps:
- name: Clone
uses: actions/checkout@v4
- name: set up JDK 11
uses: actions/setup-java@v4
with:
java-version: '11'
distribution: 'temurin'
cache: gradle
- name: Setup Android SDK
uses: android-actions/setup-android@v3
with:
cmdline-tools-version: 9.0
- name: Build
run: |
cd examples/whisper.android.java
chmod +x ./gradlew
./gradlew assembleRelease
java:
needs: [ 'windows' ]
runs-on: windows-latest
steps:
- uses: actions/checkout@v4
- name: Install Java
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: 20
- name: Download Windows lib
uses: actions/download-artifact@v4
with:
name: win32-x86-64_whisper.dll
path: bindings/java/build/generated/resources/main/win32-x86-64
- name: Build
run: |
models\download-ggml-model.cmd tiny.en
cd bindings/java
chmod +x ./gradlew
./gradlew build
- name: Upload jar
uses: actions/upload-artifact@v4
with:
name: whispercpp.jar
path: bindings/java/build/libs/whispercpp-*.jar
- name: Publish package
if: ${{ github.ref == 'refs/heads/master' }}
uses: gradle/gradle-build-action@v2.4.2
with:
arguments: publish
build-root-directory: bindings/java
env:
MAVEN_USERNAME: ${{ secrets.JIRA_USER }}
MAVEN_PASSWORD: ${{ secrets.JIRA_PASS }}
PGP_SECRET: ${{ secrets.GPG_PRIVATE_KEY }}
PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
quantize:
runs-on: ubuntu-latest


@ -18,9 +18,7 @@ jobs:
matrix:
config:
- { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
#TODO: the cuda image keeps failing - disable for now
# https://github.com/ggerganov/whisper.cpp/actions/runs/11019444428/job/30602020339
#- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
- { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }
steps:
- name: Check out the repo

.gitignore

@ -3,13 +3,11 @@
.cache/
.coreml/
.test/
.venv/
.vs/
.vscode/
.DS_Store
.vimspector.json
/CMakeSettings.json
/talk-llama.dSYM/
build/
build-*/


@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
project("whisper.cpp" C CXX)
project("whisper.cpp" VERSION 1.7.1)
project("whisper.cpp" VERSION 1.6.2)
include(CheckIncludeFileCXX)
set(SOVERSION 1)
@ -120,10 +120,7 @@ whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
# build the library
#
if (NOT TARGET ggml)
add_subdirectory(ggml)
# ... otherwise assume ggml is added by a parent CMakeLists.txt
endif()
add_subdirectory(ggml)
add_subdirectory(src)
#
@ -164,6 +161,18 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/whisper-config.cmake
${CMAKE_CURRENT_BINARY_DIR}/whisper-version.cmake
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/whisper)
install(
FILES convert-hf-to-gguf.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
configure_file(cmake/whisper.pc.in
"${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
@ONLY)

Makefile

@ -3,11 +3,12 @@ BUILD_TARGETS = \
main \
bench \
quantize \
server
server \
tests/test-c.o
# Binaries only useful for tests
TEST_TARGETS = \
tests/test-c.o
tests/test-backend-ops
# Deprecation aliases
ifdef WHISPER_CUBLAS
@ -134,18 +135,14 @@ ifdef GGML_RPC
BUILD_TARGETS += rpc-server
endif
ifdef GGML_VULKAN
BUILD_TARGETS += vulkan-shaders-gen
endif
ifeq ($(shell sdl2-config --cflags --libs 2>/dev/null),)
else
BUILD_TARGETS += \
command \
stream \
lsp \
talk \
talk-llama
# talk (TODO: disalbed)
endif
default: $(BUILD_TARGETS)
@ -254,10 +251,7 @@ ifdef WHISPER_DEBUG
MK_CPPFLAGS += -D_GLIBCXX_ASSERTIONS
endif
else
MK_CPPFLAGS += -DNDEBUG
MK_CFLAGS += -O3
MK_CXXFLAGS += -O3
MK_NVCCFLAGS += -O3
MK_CPPFLAGS += -DNDEBUG
endif
ifdef WHISPER_SANITIZE_THREAD
@ -507,15 +501,16 @@ ifdef GGML_CUDA
CUDA_PATH ?= /usr/local/cuda
endif
#MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
#MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcufft -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
MK_NVCCFLAGS += -use_fast_math
OBJ_GGML += ggml/src/ggml-cuda.o
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
OBJ_GGML += $(OBJ_CUDA_TMPL)
OBJ_WHISPER += src/whisper-mel-cuda.o
ifdef WHISPER_FATAL_WARNINGS
MK_NVCCFLAGS += -Werror all-warnings
endif # WHISPER_FATAL_WARNINGS
@ -624,12 +619,16 @@ ggml/src/ggml-cuda.o: \
ggml/src/ggml-common.h \
$(wildcard ggml/src/ggml-cuda/*.cuh)
$(NVCC_COMPILE)
src/whisper-mel-cuda.o: src/whisper-mel-cuda.cu src/whisper-mel-cuda.hpp
$(NVCC) $(NVCCFLAGS) $(CPPFLAGS) -Xcompiler "$(CUDA_CXXFLAGS)" -c $< -o $@
endif # GGML_CUDA
ifdef GGML_VULKAN
MK_CPPFLAGS += -DGGML_USE_VULKAN
MK_LDFLAGS += $(shell pkg-config --libs vulkan)
OBJ_GGML += ggml/src/ggml-vulkan.o ggml/src/ggml-vulkan-shaders.o
MK_LDFLAGS += -lvulkan
OBJ_GGML += ggml/src/ggml-vulkan.o
ifdef GGML_VULKAN_CHECK_RESULTS
MK_CPPFLAGS += -DGGML_VULKAN_CHECK_RESULTS
@ -643,10 +642,6 @@ ifdef GGML_VULKAN_MEMORY_DEBUG
MK_CPPFLAGS += -DGGML_VULKAN_MEMORY_DEBUG
endif
ifdef GGML_VULKAN_PERF
MK_CPPFLAGS += -DGGML_VULKAN_PERF
endif
ifdef GGML_VULKAN_VALIDATE
MK_CPPFLAGS += -DGGML_VULKAN_VALIDATE
endif
@ -655,28 +650,10 @@ ifdef GGML_VULKAN_RUN_TESTS
MK_CPPFLAGS += -DGGML_VULKAN_RUN_TESTS
endif
GLSLC_CMD = glslc
_ggml_vk_genshaders_cmd = $(shell pwd)/vulkan-shaders-gen
_ggml_vk_header = ggml/src/ggml-vulkan-shaders.hpp
_ggml_vk_source = ggml/src/ggml-vulkan-shaders.cpp
_ggml_vk_input_dir = ggml/src/vulkan-shaders
_ggml_vk_shader_deps = $(echo $(_ggml_vk_input_dir)/*.comp)
ggml/src/ggml-vulkan.o: ggml/src/ggml-vulkan.cpp ggml/include/ggml-vulkan.h $(_ggml_vk_header) $(_ggml_vk_source)
$(CXX) $(CXXFLAGS) $(shell pkg-config --cflags vulkan) -c $< -o $@
$(_ggml_vk_header): $(_ggml_vk_source)
$(_ggml_vk_source): $(_ggml_vk_shader_deps) vulkan-shaders-gen
$(_ggml_vk_genshaders_cmd) \
--glslc $(GLSLC_CMD) \
--input-dir $(_ggml_vk_input_dir) \
--target-hpp $(_ggml_vk_header) \
--target-cpp $(_ggml_vk_source)
vulkan-shaders-gen: ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
$(CXX) $(CXXFLAGS) -o $@ $(LDFLAGS) ggml/src/vulkan-shaders/vulkan-shaders-gen.cpp
ggml/src/ggml-vulkan.o: \
ggml/src/ggml-vulkan.cpp \
ggml/include/ggml-vulkan.h
$(CXX) $(CXXFLAGS) -c $< -o $@
endif # GGML_VULKAN
ifdef GGML_HIPBLAS
@ -803,8 +780,7 @@ OBJ_GGML += \
ggml/src/ggml.o \
ggml/src/ggml-alloc.o \
ggml/src/ggml-backend.o \
ggml/src/ggml-quants.o \
ggml/src/ggml-aarch64.o
ggml/src/ggml-quants.o
OBJ_WHISPER += \
src/whisper.o
@ -923,10 +899,10 @@ ggml/src/ggml-alloc.o: \
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-backend.o: \
ggml/src/ggml-backend.cpp \
ggml/src/ggml-backend.c \
ggml/include/ggml.h \
ggml/include/ggml-backend.h
$(CXX) $(CXXFLAGS) -c $< -o $@
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-quants.o: \
ggml/src/ggml-quants.c \
@ -935,13 +911,6 @@ ggml/src/ggml-quants.o: \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-aarch64.o: \
ggml/src/ggml-aarch64.c \
ggml/include/ggml.h \
ggml/src/ggml-aarch64.h \
ggml/src/ggml-common.h
$(CC) $(CFLAGS) -c $< -o $@
ggml/src/ggml-blas.o: \
ggml/src/ggml-blas.cpp \
ggml/include/ggml-blas.h
@ -974,6 +943,7 @@ $(LIB_GGML_S): \
src/whisper.o: \
src/whisper.cpp \
src/whisper-mel.hpp \
include/whisper.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
@ -988,8 +958,7 @@ $(LIB_WHISPER): \
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
$(LIB_WHISPER_S): \
$(OBJ_WHISPER) \
$(OBJ_GGML)
$(OBJ_WHISPER)
ar rcs $(LIB_WHISPER_S) $^
# common
@ -1066,6 +1035,9 @@ main: examples/main/main.cpp \
$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@echo
@echo '==== Run ./llama-cli -h for help. ===='
@echo
bench: examples/bench/bench.cpp \
$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON)
@ -1097,14 +1069,12 @@ lsp: examples/lsp/lsp.cpp \
$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
# TODO: disabled until update
# https://github.com/ggerganov/whisper.cpp/issues/1818
#talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
# $(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
# $(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
# $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp \
$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/llama-vocab.cpp examples/talk-llama/llama-grammar.cpp examples/talk-llama/llama-sampling.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
talk-llama: examples/talk-llama/talk-llama.cpp examples/talk-llama/llama.cpp examples/talk-llama/unicode.cpp examples/talk-llama/unicode-data.cpp \
$(OBJ_GGML) $(OBJ_WHISPER) $(OBJ_COMMON) $(OBJ_SDL)
$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) $(LDFLAGS_SDL)
@ -1118,6 +1088,11 @@ tests: $(TEST_TARGETS)
tests/test-c.o: tests/test-c.c include/whisper.h
$(CC) $(CFLAGS) -c $(filter-out %.h,$^) -o $@
tests/test-backend-ops: tests/test-backend-ops.cpp \
$(OBJ_GGML)
$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
#
# Audio samples
#
@ -1163,9 +1138,8 @@ samples:
.PHONY: large-v1
.PHONY: large-v2
.PHONY: large-v3
.PHONY: large-v3-turbo
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3 large-v3-turbo: main
tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large-v3: main
bash ./models/download-ggml-model.sh $@
@echo ""
@echo "==============================================="


@ -32,9 +32,8 @@ let package = Package(
sources: [
"ggml/src/ggml.c",
"src/whisper.cpp",
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-backend.c",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-metal.m"
],

README.md

@ -7,23 +7,21 @@
[![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Stable: [v1.7.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.6.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.6.0) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
- Plain C/C++ implementation without dependencies
- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](#core-ml-support)
- Apple Silicon first-class citizen - optimized via ARM NEON, Accelerate framework, Metal and [Core ML](https://github.com/ggerganov/whisper.cpp#core-ml-support)
- AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- [4-bit and 5-bit integer quantization support](#quantization)
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
- Zero memory allocations at runtime
- [Vulkan support](#vulkan-gpu-support)
- Support for CPU-only inference
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
- [OpenVINO Support](#openvino-support)
- [Ascend NPU Support](#ascend-npu-support)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
- [Efficient GPU support for NVIDIA](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
Supported platforms:
@ -35,9 +33,9 @@ Supported platforms:
- [x] [WebAssembly](examples/whisper.wasm)
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
- [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
Having such a lightweight implementation of the model allows to easily integrate it in different platforms and applications.
@ -57,8 +55,8 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
## Implementation details
- The core tensor operations are implemented in C ([ggml.h](ggml/include/ggml.h) / [ggml.c](ggml/src/ggml.c))
- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](include/whisper.h) / [whisper.cpp](src/whisper.cpp))
- The core tensor operations are implemented in C ([ggml.h](ggml.h) / [ggml.c](ggml.c))
- The transformer model and the high-level C-style API are implemented in C++ ([whisper.h](whisper.h) / [whisper.cpp](whisper.cpp))
- Sample usage is demonstrated in [main.cpp](examples/main)
- Sample real-time audio transcription from the microphone is demonstrated in [stream.cpp](examples/stream)
- Various other examples are available in the [examples](examples) folder
@ -73,23 +71,17 @@ First clone the repository:
git clone https://github.com/ggerganov/whisper.cpp.git
```
Navigate into the directory:
```
cd whisper.cpp
```
Then, download one of the Whisper [models](models/README.md) converted in [`ggml` format](#ggml-format). For example:
```bash
sh ./models/download-ggml-model.sh base.en
bash ./models/download-ggml-model.sh base.en
```
Now build the [main](examples/main) example and transcribe an audio file like this:
```bash
# build the main example
make -j
make
# transcribe an audio file
./main -f samples/jfk.wav
@ -100,7 +92,7 @@ make -j
For a quick demo, simply run `make base.en`:
```text
$ make -j base.en
$ make base.en
cc -I. -O3 -std=c11 -pthread -DGGML_USE_ACCELERATE -c ggml.c -o ggml.o
c++ -I. -I./examples -O3 -std=c++11 -pthread -c whisper.cpp -o whisper.o
@ -153,7 +145,7 @@ options:
-ng, --no-gpu [false ] disable GPU
sh ./models/download-ggml-model.sh base.en
bash ./models/download-ggml-model.sh base.en
Downloading ggml model base.en ...
ggml-base.en.bin 100%[========================>] 141.11M 6.34MB/s in 24s
Done! Model 'base.en' saved in 'models/ggml-base.en.bin'
@ -224,7 +216,7 @@ ffmpeg -i input.mp3 -ar 16000 -ac 1 -c:a pcm_s16le output.wav
If you want some extra audio samples to play with, simply run:
```
make -j samples
make samples
```
This will download a few more audio files from Wikipedia and convert them to 16-bit WAV format via `ffmpeg`.
@ -232,18 +224,17 @@ This will download a few more audio files from Wikipedia and convert them to 16-
You can download and run the other models as follows:
```
make -j tiny.en
make -j tiny
make -j base.en
make -j base
make -j small.en
make -j small
make -j medium.en
make -j medium
make -j large-v1
make -j large-v2
make -j large-v3
make -j large-v3-turbo
make tiny.en
make tiny
make base.en
make base
make small.en
make small
make medium.en
make medium
make large-v1
make large-v2
make large-v3
```
## Memory usage
@ -265,7 +256,7 @@ Here are the steps for creating and using a quantized model:
```bash
# quantize a model with Q5_0 method
make -j quantize
make quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
# run the examples as usual, specifying the quantized model file
@ -430,16 +421,6 @@ make clean
GGML_CUDA=1 make -j
```
## Vulkan GPU support
Cross-vendor solution which allows you to accelerate workload on your GPU.
First, make sure your graphics card driver provides support for Vulkan API.
Now build `whisper.cpp` with Vulkan support:
```
make clean
make GGML_VULKAN=1 -j
```
## BLAS CPU support via OpenBLAS
Encoder processing can be accelerated on the CPU via OpenBLAS.
@ -467,39 +448,6 @@ cmake -DWHISPER_MKL=ON ..
WHISPER_MKL=1 make -j
```
## Ascend NPU support
Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores.
First, check if your Ascend NPU device is supported:
**Verified devices**
| Ascend NPU | Status |
|:-----------------------------:|:-------:|
| Atlas 300T A2 | Support |
Then, make sure you have installed [`CANN toolkit`](https://www.hiascend.com/en/software/cann/community) . The lasted version of CANN is recommanded.
Now build `whisper.cpp` with CANN support:
```
mkdir build
cd build
cmake .. -D GGML_CANN=on
make -j
```
Run the inference examples as usual, for example:
```
./build/bin/main -f samples/jfk.wav -m models/ggml-base.en.bin -t 8
```
*Notes:*
- If you have trouble with Ascend NPU device, please create a issue with **[CANN]** prefix/tag.
- If you run successfully with your Ascend NPU device, please help update the table `Verified devices`.
## Docker
### Prerequisites
@ -636,7 +584,7 @@ The [stream](examples/stream) tool samples the audio every half a second and run
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
```bash
make stream -j
make stream
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```
@ -803,7 +751,7 @@ took to execute it. The results are summarized in the following Github issue:
[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](scripts/bench.py).
Additionally a script to run whisper.cpp with different models and audio files is provided [bench.py](bench.py).
You can run it with the following command, by default it will run against any standard model in the models folder.
@ -850,7 +798,6 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
- [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
- [abdeladim-s/pywhispercpp](https://github.com/abdeladim-s/pywhispercpp) (Pybind11)
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)


@ -14,14 +14,9 @@ GGML_METAL_PATH_RESOURCES := $(abspath ../..)
BUILD_DIR := build
MODELS_DIR := models
EXAMPLES_DIR := $(wildcard examples/*)
INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
INCLUDE_PATH := $(abspath ../..)
LIBRARY_PATH := $(abspath ../..)
ifeq ($(GGML_CUDA),1)
LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
BUILD_FLAGS := -ldflags "-extldflags '-lcudart -lcuda -lcublas'"
endif
ifeq ($(UNAME_S),Darwin)
EXT_LDFLAGS := -framework Foundation -framework Metal -framework MetalKit
endif


@ -62,12 +62,6 @@ This will compile a static `libwhisper.a` in a `build` folder, download a model
make examples
```
To build using cuda support add `GGML_CUDA=1`:
```bash
GGML_CUDA=1 make examples
```
The examples are placed in the `build` directory. Once built, you can download all the models with the following command:
```bash


@ -24,7 +24,7 @@ const (
var (
// The models which will be downloaded, if no model is specified as an argument
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3", "large-v3-turbo"}
modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large-v3"}
)
var (


@ -1,10 +1,10 @@
module github.com/ggerganov/whisper.cpp/bindings/go
go 1.23
go 1.19
require (
github.com/go-audio/wav v1.1.0
github.com/stretchr/testify v1.9.0
github.com/stretchr/testify v1.8.1
)
require (


@ -1,3 +1,4 @@
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
@ -8,9 +9,15 @@ github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk=
github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=


@ -119,28 +119,6 @@ func (p *Params) SetAudioCtx(n int) {
p.audio_ctx = C.int(n)
}
func (p *Params) SetMaxContext(n int) {
p.n_max_text_ctx = C.int(n)
}
func (p *Params) SetBeamSize(n int) {
p.beam_search.beam_size = C.int(n)
}
func (p *Params) SetEntropyThold(t float32) {
p.entropy_thold = C.float(t)
}
func (p *Params) SetTemperature(t float32) {
p.temperature = C.float(t)
}
// Sets the fallback temperature incrementation
// Pass -1.0 to disable this feature
func (p *Params) SetTemperatureFallback(t float32) {
p.temperature_inc = C.float(t)
}
// Set initial prompt
func (p *Params) SetInitialPrompt(prompt string) {
p.initial_prompt = C.CString(prompt)
@ -171,10 +149,6 @@ func (p *Params) String() string {
str += fmt.Sprintf(" duration_ms=%d", p.duration_ms)
str += fmt.Sprintf(" audio_ctx=%d", p.audio_ctx)
str += fmt.Sprintf(" initial_prompt=%s", C.GoString(p.initial_prompt))
str += fmt.Sprintf(" entropy_thold=%f", p.entropy_thold)
str += fmt.Sprintf(" temperature=%f", p.temperature)
str += fmt.Sprintf(" temperature_inc=%f", p.temperature_inc)
str += fmt.Sprintf(" beam_size=%d", p.beam_search.beam_size)
if p.translate {
str += " translate"
}


@ -125,32 +125,6 @@ func (context *context) SetAudioCtx(n uint) {
context.params.SetAudioCtx(int(n))
}
// Set maximum number of text context tokens to store
func (context *context) SetMaxContext(n int) {
context.params.SetMaxContext(n)
}
// Set Beam Size
func (context *context) SetBeamSize(n int) {
context.params.SetBeamSize(n)
}
// Set Entropy threshold
func (context *context) SetEntropyThold(t float32) {
context.params.SetEntropyThold(t)
}
// Set Temperature
func (context *context) SetTemperature(t float32) {
context.params.SetTemperature(t)
}
// Set the fallback temperature incrementation
// Pass -1.0 to disable this feature
func (context *context) SetTemperatureFallback(t float32) {
context.params.SetTemperatureFallback(t)
}
// Set initial prompt
func (context *context) SetInitialPrompt(prompt string) {
context.params.SetInitialPrompt(prompt)


@ -4,90 +4,52 @@ import (
"os"
"testing"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
"github.com/go-audio/wav"
// Packages
whisper "github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
assert "github.com/stretchr/testify/assert"
)
func TestSetLanguage(t *testing.T) {
assert := assert.New(t)
const (
ModelPath = "../../models/ggml-tiny.bin"
SamplePath = "../../samples/jfk.wav"
)
func Test_Whisper_000(t *testing.T) {
assert := assert.New(t)
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
t.Skip("Skipping test, model not found:", ModelPath)
}
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
t.Skip("Skipping test, sample not found:", SamplePath)
}
// Load model
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
assert.NoError(model.Close())
t.Log("languages=", model.Languages())
}
func Test_Whisper_001(t *testing.T) {
assert := assert.New(t)
if _, err := os.Stat(ModelPath); os.IsNotExist(err) {
t.Skip("Skipping test, model not found:", ModelPath)
}
if _, err := os.Stat(SamplePath); os.IsNotExist(err) {
t.Skip("Skipping test, sample not found:", SamplePath)
}
// Load model
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
// Get context for decoding
ctx, err := model.NewContext()
assert.NoError(err)
assert.NotNil(ctx)
// This returns an error since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
err = context.SetLanguage("en")
assert.Error(err)
}
func TestContextModelIsMultilingual(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
isMultilingual := context.IsMultilingual()
// This returns false since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
assert.False(isMultilingual)
}
func TestLanguage(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
// This always returns en since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
expectedLanguage := "en"
actualLanguage := context.Language()
assert.Equal(expectedLanguage, actualLanguage)
}
func TestProcess(t *testing.T) {
assert := assert.New(t)
fh, err := os.Open(SamplePath)
assert.NoError(err)
defer fh.Close()
// Decode the WAV file - load the full buffer
dec := wav.NewDecoder(fh)
buf, err := dec.FullPCMBuffer()
assert.NoError(err)
assert.Equal(uint16(1), dec.NumChans)
data := buf.AsFloat32Buffer().Data
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
err = context.Process(data, nil, nil)
assert.NoError(err)
}


@ -38,22 +38,17 @@ type Context interface {
IsMultilingual() bool // Return true if the model is multilingual.
Language() string // Get language
SetOffset(time.Duration) // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSplitOnWord(bool) // Set split on word flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
SetMaxSegmentLength(uint) // Set max segment length in characters
SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
SetAudioCtx(uint) // Set audio encoder context
SetMaxContext(n int) // Set maximum number of text context tokens to store
SetBeamSize(n int) // Set Beam Size
SetEntropyThold(t float32) // Set Entropy threshold
SetInitialPrompt(prompt string) // Set initial prompt
SetTemperature(t float32) // Set temperature
SetTemperatureFallback(t float32) // Set temperature incrementation
SetOffset(time.Duration) // Set offset
SetDuration(time.Duration) // Set duration
SetThreads(uint) // Set number of threads to use
SetSplitOnWord(bool) // Set split on word flag
SetTokenThreshold(float32) // Set timestamp token probability threshold
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
SetMaxSegmentLength(uint) // Set max segment length in characters
SetTokenTimestamps(bool) // Set token timestamps flag
SetMaxTokensPerSegment(uint) // Set max tokens per segment (0 = no limit)
SetAudioCtx(uint) // Set audio encoder context
SetInitialPrompt(prompt string) // Set initial prompt
// Process mono audio data and return any errors.
// If defined, newly generated segments are passed to the


@ -1,91 +0,0 @@
package whisper_test
import (
"testing"
"github.com/ggerganov/whisper.cpp/bindings/go/pkg/whisper"
assert "github.com/stretchr/testify/assert"
)
func TestNew(t *testing.T) {
assert := assert.New(t)
t.Run("valid model path", func(t *testing.T) {
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
})
t.Run("invalid model path", func(t *testing.T) {
invalidModelPath := "invalid-model-path.bin"
model, err := whisper.New(invalidModelPath)
assert.Error(err)
assert.Nil(model)
})
}
func TestClose(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
err = model.Close()
assert.NoError(err)
}
func TestNewContext(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
context, err := model.NewContext()
assert.NoError(err)
assert.NotNil(context)
}
func TestIsMultilingual(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
isMultilingual := model.IsMultilingual()
// This returns false since
// the model 'models/ggml-small.en.bin'
// that is loaded is not multilingual
assert.False(isMultilingual)
}
func TestLanguages(t *testing.T) {
assert := assert.New(t)
model, err := whisper.New(ModelPath)
assert.NoError(err)
assert.NotNil(model)
defer model.Close()
expectedLanguages := []string{
"en", "zh", "de", "es", "ru", "ko", "fr", "ja", "pt", "tr", "pl",
"ca", "nl", "ar", "sv", "it", "id", "hi", "fi", "vi", "he", "uk",
"el", "ms", "cs", "ro", "da", "hu", "ta", "no", "th", "ur", "hr",
"bg", "lt", "la", "mi", "ml", "cy", "sk", "te", "fa", "lv", "bn",
"sr", "az", "sl", "kn", "et", "mk", "br", "eu", "is", "hy", "ne",
"mn", "bs", "kk", "sq", "sw", "gl", "mr", "pa", "si", "km", "sn",
"yo", "so", "af", "oc", "ka", "be", "tg", "sd", "gu", "am", "yi",
"lo", "uz", "fo", "ht", "ps", "tk", "nn", "mt", "sa", "lb", "my",
"bo", "tl", "mg", "as", "tt", "haw", "ln", "ha", "ba", "jw", "su",
}
actualLanguages := model.Languages()
assert.Equal(expectedLanguages, actualLanguages)
}


@ -1,6 +0,0 @@
package whisper_test
const (
ModelPath = "../../models/ggml-small.en.bin"
SamplePath = "../../samples/jfk.wav"
)


@ -9,7 +9,7 @@ import (
// CGO
/*
#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
#cgo LDFLAGS: -lwhisper -lm -lstdc++
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
#include <whisper.h>
#include <stdlib.h>


@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.7.1",
"version": "1.6.2",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {


@ -1,3 +0,0 @@
LICENSE
pkg/
lib/whisper.*


@ -1,111 +0,0 @@
whispercpp
==========
![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
Ruby bindings for [whisper.cpp][], an interface of automatic speech recognition model.
Installation
------------
Install the gem and add to the application's Gemfile by executing:
$ bundle add whispercpp
If bundler is not being used to manage dependencies, install the gem by executing:
$ gem install whispercpp
Usage
-----
```ruby
require "whisper"
whisper = Whisper::Context.new("path/to/model.bin")
params = Whisper::Params.new
params.language = "en"
params.offset = 10_000
params.duration = 60_000
params.max_text_tokens = 300
params.translate = true
params.print_timestamps = false
params.prompt = "Initial prompt here."
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
puts whole_text
end
```
### Preparing model ###
Use script to download model file(s):
```bash
git clone https://github.com/ggerganov/whisper.cpp.git
cd whisper.cpp
sh ./models/download-ggml-model.sh base.en
```
There are some types of models. See [models][] page for details.
### Preparing audio file ###
Currently, whisper.cpp accepts only 16-bit WAV files.
### API ###
Once `Whisper::Context#transcribe` called, you can retrieve segments by `#each_segment`:
```ruby
def format_time(time_ms)
sec, decimal_part = time_ms.divmod(1000)
min, sec = sec.divmod(60)
hour, min = min.divmod(60)
"%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end
whisper.transcribe("path/to/audio.wav", params)
whisper.each_segment.with_index do |segment, index|
line = "[%{nth}: %{st} --> %{ed}] %{text}" % {
nth: index + 1,
st: format_time(segment.start_time),
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
puts line
end
```
You can also add hook to params called on new segment:
```ruby
def format_time(time_ms)
sec, decimal_part = time_ms.divmod(1000)
min, sec = sec.divmod(60)
hour, min = min.divmod(60)
"%02d:%02d:%02d.%03d" % [hour, min, sec, decimal_part]
end
# Add hook before calling #transcribe
params.on_new_segment do |segment|
line = "[%{st} --> %{ed}] %{text}" % {
st: format_time(segment.start_time),
ed: format_time(segment.end_time),
text: segment.text
}
line << " (speaker turned)" if segment.speaker_next_turn?
puts line
end
whisper.transcribe("path/to/audio.wav", params)
```
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models


@ -1,59 +1,12 @@
require 'rake/clean'
require "bundler/gem_tasks"
require "pathname"
require "yaml"
require "rake/testtask"
require 'rubygems/package'
extsources = YAML.load_file("extsources.yaml")
SOURCES = FileList[]
extsources.each do |src|
basename = src.pathmap("%f")
dest = basename == "LICENSE" ? basename : basename.pathmap("ext/%f")
file src
file dest => src do |t|
cp t.source, t.name
end
SOURCES.include dest
end
CLEAN.include SOURCES
CLEAN.include FileList[
"ext/*.o",
"ext/*.metal",
"ext/whisper.{so,bundle,dll}",
"ext/depend"
]
task build: SOURCES + FileList[
"ext/extconf.rb",
"ext/ruby_whisper.h",
"ext/ruby_whisper.cpp",
"whispercpp.gemspec",
]
directory "pkg"
CLOBBER.include "pkg"
TEST_MODEL = "../../models/ggml-base.en.bin"
LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
LIB_FILE = File.join("lib", LIB_NAME)
directory "lib"
task LIB_FILE => SOURCES + ["lib"] do |t|
Dir.chdir "ext" do
sh "ruby extconf.rb"
sh "make"
end
mv "ext/#{LIB_NAME}", t.name
end
CLEAN.include LIB_FILE
Rake::TestTask.new do |t|
t.test_files = FileList["tests/test_*.rb"]
end
task test: [TEST_MODEL, LIB_FILE]
file TEST_MODEL do
Dir.chdir "../.." do
sh "./models/download-ggml-model.sh base.en"
end
desc 'Build gem'
task :package do
spec_source = File.read File.join(File.dirname(__FILE__),'whispercpp.gemspec')
spec = nil
# see: http://gist.github.com/16215
Thread.new { spec = eval("#{spec_source}") }.join
spec.validate
Gem::Package.build(spec)
end


@ -3,33 +3,7 @@ ggml.c
ggml.h
ggml-alloc.c
ggml-alloc.h
ggml-aarch64.c
ggml-aarch64.h
ggml-backend.cpp
ggml-backend-impl.h
ggml-backend.c
ggml-backend.h
ggml-common.h
ggml-cpu-impl.h
ggml-metal.m
ggml-metal.metal
ggml-metal-embed.metal
ggml-blas.cpp
ggml-cuda.h
ggml-impl.h
ggml-kompute.h
ggml-metal.h
ggml-opencl.h
ggml-quants.c
ggml-quants.h
ggml-sycl.h
ggml-vulkan.h
ggml-blas.h
get-flags.mk
whisper.bundle
whisper.cpp
whisper.h
dr_wav.h
depend
whisper.bundle
whisper.so
whisper.dll


@ -1,4 +1,20 @@
require 'mkmf'
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.cpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','whisper-mel.hpp')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-alloc.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend-impl.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-backend.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-common.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.h')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','ggml-quants.c')} .")
system("cp #{File.join(File.dirname(__FILE__),'..','..','..','examples','dr_wav.h')} .")
# need to use c++ compiler flags
$CXXFLAGS << ' -std=c++11'
@ -12,219 +28,4 @@ if enable_config('march-tune-native', false)
$CXXFLAGS << ' -march=native -mtune=native'
end
def with_disabling_unsupported_files
disabled_files = []
unless $GGML_METAL
disabled_files << 'ggml-metal.h' << 'ggml-metal.m'
end
unless $GGML_METAL_EMBED_LIBRARY
disabled_files << 'ggml-metal.metal'
end
unless $OBJ_ALL&.include? 'ggml-blas.o'
disabled_files << 'ggml-blas.h' << 'ggml-blas.cpp'
end
disabled_files.filter! {|file| File.exist? file}
disabled_files.each do |file|
File.rename file, "#{file}.disabled"
end
yield
disabled_files.each do |file|
File.rename "#{file}.disabled", file
end
end
if ENV['WHISPER_METAL']
$GGML_METAL ||= true
$DEPRECATE_WARNING ||= true
end
$UNAME_S = `uname -s`.chomp
$UNAME_P = `uname -p`.chomp
$UNAME_M = `uname -m`.chomp
if $UNAME_S == 'Darwin'
unless ENV['GGML_NO_METAL']
$GGML_METAL ||= true
end
$GGML_NO_OPENMP ||= true
end
if $GGML_METAL
$GGML_METAL_EMBED_LIBRARY = true
end
$MK_CPPFLAGS = ''
$MK_CFLAGS = '-std=c11 -fPIC'
$MK_CXXFLAGS = '-std=c++11 -fPIC'
$MK_NVCCFLAGS = '-std=c++11'
$MK_LDFLAGS = ''
$OBJ_GGML = ''
$OBJ_WHISPER = ''
$OBJ_COMMON = ''
$OBJ_SDL = ''
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
if $UNAME_S == 'Linux'
$MK_CPPFLAGS << ' -D_GNU_SOURCE'
end
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -D_DARWIN_C_SOURCE'
end
if ENV['WHISPER_DEBUG']
$MK_CFLAGS << ' -O0 -g'
$MK_CXXFLAGS << ' -O0 -g'
$MK_LDFLAGS << ' -g'
$MK_NVCCFLAGS << ' -O0 -g'
else
$MK_CPPFLAGS << ' -DNDEBUG'
$MK_CFLAGS << ' -O3'
$MK_CXXFLAGS << ' -O3'
$MK_NVCCFLAGS << ' -O3'
end
$WARN_FLAGS =
' -Wall' <<
' -Wextra' <<
' -Wpedantic' <<
' -Wcast-qual' <<
' -Wno-unused-function'
$MK_CFLAGS <<
$WARN_FLAGS <<
' -Wshadow' <<
' -Wstrict-prototypes' <<
' -Wpointer-arith' <<
' -Wmissing-prototypes' <<
' -Werror=implicit-int' <<
' -Werror=implicit-function-declaration'
$MK_CXXFLAGS <<
$WARN_FLAGS <<
' -Wmissing-declarations' <<
' -Wmissing-noreturn'
unless `#{cc_command} #{$LDFLAGS} -Wl,-v 2>&1`.chomp.include? 'dyld-1015.7'
$MK_CPPFLAGS << ' -DHAVE_BUGGY_APPLE_LINKER'
end
if %w[Linux Darwin FreeBSD NetBSD OpenBSD Haiku].include? $UNAME_S
$MK_CFLAGS << ' -pthread'
$MK_CXXFLAGS << ' -pthread'
end
unless $_WIN32
$DSO_EXT = '.so'
else
$DSO_EXT = '.dll'
end
unless ENV['RISCV']
if %w[x86_64 i686 amd64].include? $UNAME_M
$HOST_CXXFLAGS ||= ''
$MK_CFLAGS << ' -march=native -mtune=native'
$HOST_CXXFLAGS << ' -march=native -mtune=native'
end
if $UNAME_M.match? /aarch64.*/
$MK_CFLAGS << ' -mcpu=native'
$MK_CXXFLAGS << ' -mcpu=native'
end
else
$MK_CFLAGS << ' -march=rv64gcv -mabi=lp64d'
$MK_CXXFLAGS << ' -march=rv64gcv -mabi=lp64d'
end
unless ENV['GGML_NO_ACCELERATE']
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS'
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
$MK_LDFLAGS << ' -framework Accelerate'
$OBJ_GGML << ' ggml-blas.o'
end
end
if ENV['GGML_OPENBLAS']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas`}"
$OBJ_GGML << ' ggml-blas.o'
end
if ENV['GGML_OPENBLAS64']
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64)`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`}"
$OBJ_GGML << ' ggml-blas.o'
end
if $GGML_METAL
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
$OBJ_GGML << ' ggml-metal.o'
if ENV['GGML_METAL_NDEBUG']
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
end
if $GGML_METAL_EMBED_LIBRARY
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
$OBJ_GGML << ' ggml-metal-embed.o'
end
end
$OBJ_GGML <<
' ggml.o' <<
' ggml-alloc.o' <<
' ggml-backend.o' <<
' ggml-quants.o' <<
' ggml-aarch64.o'
$OBJ_WHISPER <<
' whisper.o'
$OBJ_ALL = "#{$OBJ_GGML} #{$OBJ_WHISPER} #{$OBJ_COMMON} #{$OBJ_SDL}"
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
$BASE_CXXFLAGS = "#{$MK_CXXFLAGS} #{$CXXFLAGS}"
$CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
if $GGML_METAL_EMBED_LIBRARY
File.write 'depend', "$(OBJS): $(OBJS) ggml-metal-embed.o\n"
end
with_disabling_unsupported_files do
create_makefile('whisper')
end
File.open 'Makefile', 'a' do |file|
file.puts 'include get-flags.mk'
if $GGML_METAL
if $GGML_METAL_EMBED_LIBRARY
# mkmf determines object files to compile dependent on existing *.{c,cpp,m} files
# but ggml-metal-embed.c doesn't exist on creating Makefile.
file.puts "objs := $(OBJS)"
file.puts "OBJS = $(objs) 'ggml-metal-embed.o'"
file.puts 'include metal-embed.mk'
end
end
end
create_makefile('whisper')


@ -0,0 +1,141 @@
#pragma once
// ggml-backend internal header
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
//
// Backend buffer
//
// buffer type
typedef void * ggml_backend_buffer_type_context_t;
struct ggml_backend_buffer_type_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};
struct ggml_backend_buffer_type {
struct ggml_backend_buffer_type_i iface;
ggml_backend_buffer_type_context_t context;
};
// buffer
typedef void * ggml_backend_buffer_context_t;
struct ggml_backend_buffer_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_t buffer);
void (*GGML_CALL free_buffer)(ggml_backend_buffer_t buffer);
void * (*GGML_CALL get_base) (ggml_backend_buffer_t buffer);
void (*GGML_CALL init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
void (*GGML_CALL set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer
void (*GGML_CALL clear) (ggml_backend_buffer_t buffer, uint8_t value);
void (*GGML_CALL reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
};
struct ggml_backend_buffer {
struct ggml_backend_buffer_i iface;
ggml_backend_buffer_type_t buft;
ggml_backend_buffer_context_t context;
size_t size;
enum ggml_backend_buffer_usage usage;
};
GGML_CALL ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
ggml_backend_buffer_context_t context,
size_t size);
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// buffer that contains a collection of buffers
GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend
//
typedef void * ggml_backend_context_t;
struct ggml_backend_i {
const char * (*GGML_CALL get_name)(ggml_backend_t backend);
void (*GGML_CALL free)(ggml_backend_t backend);
// buffer allocation
ggml_backend_buffer_type_t (*GGML_CALL get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*GGML_CALL set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*GGML_CALL get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*GGML_CALL cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations
void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph with a plan
enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async)
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// check if the backend supports an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
void (*GGML_CALL event_record) (ggml_backend_event_t event);
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
};
struct ggml_backend {
ggml_guid_t guid;
struct ggml_backend_i iface;
ggml_backend_context_t context;
};
struct ggml_backend_event {
ggml_backend_t backend;
void * context;
};
//
// Backend registry
//
typedef ggml_backend_t (*GGML_CALL ggml_backend_init_fn)(const char * params, void * user_data);
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data);
#ifdef __cplusplus
}
#endif
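
To make the function-pointer tables above concrete, here is a minimal sketch of a malloc-backed host buffer wired into this internal interface. It is illustrative only, not code from the diff: the `my_host_*` names are hypothetical, and a real backend would also provide a matching `ggml_backend_buffer_type_i` table, use aligned allocations, and apply the `GGML_CALL` calling convention where the target platform requires it.

```c
// Hypothetical sketch of a malloc-backed host buffer built on the internal
// ggml-backend interface above. Names prefixed with my_host_ are illustrative.
#include <stdlib.h>
#include <string.h>

#include "ggml-backend-impl.h"

static const char * my_host_buffer_get_name(ggml_backend_buffer_t buffer) {
    (void) buffer;
    return "my_host";
}

static void my_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    free(buffer->context); // context holds the raw allocation
}

static void * my_host_buffer_get_base(ggml_backend_buffer_t buffer) {
    return buffer->context;
}

static void my_host_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor,
                                      const void * data, size_t offset, size_t size) {
    (void) buffer;
    memcpy((char *) tensor->data + offset, data, size);
}

static void my_host_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor,
                                      void * data, size_t offset, size_t size) {
    (void) buffer;
    memcpy(data, (const char *) tensor->data + offset, size);
}

static void my_host_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    memset(buffer->context, value, buffer->size);
}

static const struct ggml_backend_buffer_i my_host_buffer_iface = {
    /* .get_name    = */ my_host_buffer_get_name,
    /* .free_buffer = */ my_host_buffer_free_buffer,
    /* .get_base    = */ my_host_buffer_get_base,
    /* .init_tensor = */ NULL, // no per-tensor extras needed
    /* .set_tensor  = */ my_host_buffer_set_tensor,
    /* .get_tensor  = */ my_host_buffer_get_tensor,
    /* .cpy_tensor  = */ NULL, // fall back to the generic copy path
    /* .clear       = */ my_host_buffer_clear,
    /* .reset       = */ NULL,
};

// The buffer type's alloc_buffer hook would hand the allocation to ggml:
static ggml_backend_buffer_t my_host_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * data = malloc(size); // a real backend would respect tensor alignment here
    if (data == NULL) {
        return NULL;
    }
    return ggml_backend_buffer_init(buft, my_host_buffer_iface, data, size);
}
```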

File diff suppressed because it is too large

@ -0,0 +1,233 @@
#pragma once
#include "ggml.h"
#include "ggml-alloc.h"
#ifdef __cplusplus
extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
//
// Backend buffer
//
// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
//
// Backend
//
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
GGML_API const char * ggml_backend_name(ggml_backend_t backend);
GGML_API void ggml_backend_free(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
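// Illustrative sketch (not part of the original header): expressing a dependency
// between two backends with the event API above, combined with an async tensor
// copy. backend_src/backend_dst, the tensors and graph_dst are assumed to be set
// up elsewhere; event support is optional, so event_new may return NULL.
void example_event_sync(ggml_backend_t backend_src, ggml_backend_t backend_dst,
                        struct ggml_tensor * src, struct ggml_tensor * dst,
                        struct ggml_cgraph * graph_dst) {
    // queued after the work currently pending on backend_src; falls back to a
    // synchronous copy if async transfers are not supported
    ggml_backend_tensor_copy_async(backend_src, backend_dst, src, dst);

    // optionally make the dependency explicit with an event
    ggml_backend_event_t event = ggml_backend_event_new(backend_src);
    if (event) {
        ggml_backend_event_record(event);
        ggml_backend_event_wait(backend_dst, event); // async wait on backend_dst's queue
    }

    ggml_backend_graph_compute_async(backend_dst, graph_dst);
    ggml_backend_synchronize(backend_dst); // block the host until the result is ready

    if (event) {
        ggml_backend_event_free(event);
    }
}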
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
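// Illustrative sketch (not part of the original header): computing a tiny graph on
// the CPU backend. Besides the declarations above it only uses the core ggml.h API
// (ggml_init, ggml_new_tensor_1d, ggml_add, ggml_new_graph, ggml_build_forward_expand).
void example_cpu_backend(void) {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16*1024*1024, // scratch for tensor metadata, data and the graph
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,        // tensor data lives in the context (host memory)
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    for (int i = 0; i < 8; ++i) {
        ((float *) a->data)[i] = (float) i;
        ((float *) b->data)[i] = 1.0f;
    }

    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, c);

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(backend, 4);
    ggml_backend_graph_compute(backend, gf);
    // ((float *) c->data)[i] now holds a[i] + b[i]

    ggml_backend_free(backend);
    ggml_free(ctx);
}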
//
// Backend registry
//
// The backend registry keeps track of all the available backends and allows initializing them in a generic way
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
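// Illustrative sketch (not part of the original header): enumerating the registry
// and initializing a backend by name, falling back to the CPU backend when the
// requested name (e.g. "CUDA0") is not available. Assumes init_backend_from_str
// returns NULL on failure.
#include <stdio.h>

ggml_backend_t example_init_backend_by_name(const char * name /* "name[:params]" */) {
    for (size_t i = 0; i < ggml_backend_reg_get_count(); ++i) {
        printf("registered backend %zu: %s\n", i, ggml_backend_reg_get_name(i));
    }
    ggml_backend_t backend = ggml_backend_reg_init_backend_from_str(name);
    if (backend == NULL) {
        backend = ggml_backend_cpu_init(); // always available
    }
    return backend;
}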
//
// Backend scheduler
//
// The backend scheduler allows for multiple backends to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
// - the location of the pre-allocated tensors (e.g. the weights)
/*
Example usage:
// operations that use tensors allocated in a buffer with USAGE_WEIGHTS will be assigned
// preferably to run on the same backend as the buffer
ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
// initialize buffers from a max size graph (optional)
reserve_graph = build_graph(sched, max_batch_size);
// manually assign nodes to a backend (optional, should not be needed in most cases)
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
ggml_backend_sched_set_tensor_backend(sched, node, backend_gpu);
ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
// if there are graph inputs:
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
*/
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
// when ask == false, the scheduler is passing the node tensor to the user for observation
// if the user returns false, the scheduler will cancel the graph compute
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
GGML_API int ggml_backend_sched_get_n_copies(ggml_backend_sched_t sched);
GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
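// Illustrative sketch (not part of the original header): an eval callback that asks
// the scheduler to surface a node by name (the name "logits" is hypothetical) and
// prints it once it has been computed.
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

static bool example_eval_callback(struct ggml_tensor * t, bool ask, void * user_data) {
    (void) user_data;
    if (ask) {
        // first pass: tell the scheduler which nodes we want to observe
        return strcmp(t->name, "logits") == 0;
    }
    // second pass: the node has been computed and its data can be inspected
    fprintf(stderr, "observed %s: type=%s, ne0=%" PRId64 "\n", t->name, ggml_type_name(t->type), t->ne[0]);
    return true; // returning false cancels the graph compute
}
// registration: ggml_backend_sched_set_eval_callback(sched, example_eval_callback, NULL);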
//
// Utils
//
struct ggml_backend_graph_copy {
ggml_backend_buffer_t buffer;
struct ggml_context * ctx_allocated;
struct ggml_context * ctx_unallocated;
struct ggml_cgraph * graph;
};
// Copy a graph to a different backend
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
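// Illustrative sketch (not part of the original header): a per-node callback for
// ggml_backend_compare_graph_backend; t1 and t2 are the same node computed by the
// two backends. A real check would download both with ggml_backend_tensor_get and
// compare element-wise within a tolerance.
#include <stdio.h>

static bool example_compare_callback(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
    (void) user_data;
    fprintf(stderr, "comparing node %d (%s / %s)\n", node_index, t1->name, t2->name);
    return true; // return false to stop the comparison early
}
// usage: ggml_backend_compare_graph_backend(backend_cpu, backend_gpu, graph, example_compare_callback, NULL);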
// Tensor initialization
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,43 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
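// Illustrative sketch (not part of the original header): enumerating CUDA devices
// and initializing a backend on the one with the most free memory.
#include <stdio.h>

ggml_backend_t example_init_best_cuda_backend(void) {
    int best = -1;
    size_t best_free = 0;
    for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) {
        char   desc[128];
        size_t free_mem, total_mem;
        ggml_backend_cuda_get_device_description(i, desc, sizeof(desc));
        ggml_backend_cuda_get_device_memory(i, &free_mem, &total_mem);
        printf("CUDA device %d: %s (%zu / %zu MiB free)\n", i, desc, free_mem/(1024*1024), total_mem/(1024*1024));
        if (free_mem > best_free) { best_free = free_mem; best = i; }
    }
    return best < 0 ? NULL : ggml_backend_cuda_init(best);
}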
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,272 @@
#pragma once
#include "ggml.h"
// GGML internal header
#include <assert.h>
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
#include <stddef.h>
#include <stdbool.h>
#include <string.h> // memcpy
#include <math.h> // fabsf
#ifdef __cplusplus
extern "C" {
#endif
// if static_assert is not already provided as a #define (e.g. by <assert.h>),
// fall back to the C11 _Static_assert keyword
// in plain C99, static_assert becomes a no-op
// ref: https://stackoverflow.com/a/53923785/4039976
#ifndef __cplusplus
#ifndef static_assert
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
#define static_assert(cond, msg) _Static_assert(cond, msg)
#else
#define static_assert(cond, msg) struct global_scope_noop_trick
#endif
#endif
#endif
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
#ifndef __FMA__
#define __FMA__
#endif
#ifndef __F16C__
#define __F16C__
#endif
#endif
// __SSE3__ and __SSSE3__ are not defined in MSVC, but SSE3/SSSE3 are present when AVX/AVX2/AVX512 are available
#if defined(_MSC_VER) && (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__))
#ifndef __SSE3__
#define __SSE3__
#endif
#ifndef __SSSE3__
#define __SSSE3__
#endif
#endif
// 16-bit float
// on Arm, we use __fp16
// on x86, we use uint16_t
#if defined(__ARM_NEON) && !defined(_MSC_VER)
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
//
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
//
#include <arm_neon.h>
typedef __fp16 ggml_fp16_internal_t;
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
ggml_fp16_internal_t tmp;
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
return (float)tmp;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
ggml_fp16_t res;
ggml_fp16_internal_t tmp = f;
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
return res;
}
#else
typedef uint16_t ggml_fp16_internal_t;
#ifdef __wasm_simd128__
#include <wasm_simd128.h>
#else
#ifdef __POWER9_VECTOR__
#include <altivec.h>
#undef bool
#define bool _Bool
#else
#if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h>
#else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
#if !defined(__riscv)
#include <immintrin.h>
#endif
#endif
#endif
#endif
#endif
#ifdef __riscv_v_intrinsic
#include <riscv_vector.h>
#endif
#ifdef __F16C__
#ifdef _MSC_VER
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
#else
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
#endif
#elif defined(__POWER9_VECTOR__)
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
/* the inline asm below is about 12% faster than the lookup method */
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
register float f;
register double d;
__asm__(
"mtfprd %0,%2\n"
"xscvhpdp %0,%0\n"
"frsp %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=f"(f):
/* in */ "r"(h));
return f;
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
register double d;
register ggml_fp16_t r;
__asm__( /* xscvdphp can work on double or single precision */
"xscvdphp %0,%2\n"
"mffprd %1,%0\n" :
/* temp */ "=d"(d),
/* out */ "=r"(r):
/* in */ "f"(f));
return r;
}
#else
// FP16 <-> FP32
// ref: https://github.com/Maratyszcza/FP16
static inline float fp32_from_bits(uint32_t w) {
union {
uint32_t as_bits;
float as_value;
} fp32;
fp32.as_bits = w;
return fp32.as_value;
}
static inline uint32_t fp32_to_bits(float f) {
union {
float as_value;
uint32_t as_bits;
} fp32;
fp32.as_value = f;
return fp32.as_bits;
}
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
const uint32_t w = (uint32_t) h << 16;
const uint32_t sign = w & UINT32_C(0x80000000);
const uint32_t two_w = w + w;
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float exp_scale = 0x1.0p-112f;
#else
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
const uint32_t magic_mask = UINT32_C(126) << 23;
const float magic_bias = 0.5f;
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
const uint32_t result = sign |
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
return fp32_from_bits(result);
}
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
const float scale_to_inf = 0x1.0p+112f;
const float scale_to_zero = 0x1.0p-110f;
#else
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
const uint32_t w = fp32_to_bits(f);
const uint32_t shl1_w = w + w;
const uint32_t sign = w & UINT32_C(0x80000000);
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
if (bias < UINT32_C(0x71000000)) {
bias = UINT32_C(0x71000000);
}
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
const uint32_t bits = fp32_to_bits(base);
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
const uint32_t nonsign = exp_bits + mantissa_bits;
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
#endif // __F16C__
#endif // __ARM_NEON
// precomputed f32 table for f16 (256 KB)
// defined in ggml.c, initialized in ggml_init()
extern float ggml_table_f32_f16[1 << 16];
// On ARM NEON, it is quicker to convert fp16 -> fp32 directly than to call into
// ggml_lookup_fp16_to_fp32, so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
// This is also true for POWER9.
#if !defined(GGML_FP16_TO_FP32)
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
uint16_t s;
memcpy(&s, &f, sizeof(uint16_t));
return ggml_table_f32_f16[s];
}
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
#endif
#if !defined(GGML_FP32_TO_FP16)
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
#endif
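// Illustrative sketch (not part of the original internal header): a round trip
// through the conversion macros above. Note that the table-based
// GGML_FP16_TO_FP32 path uses ggml_table_f32_f16, which is initialized in
// ggml_init(), so a ggml context must have been created at least once.
static inline float example_fp16_roundtrip(float x) {
    ggml_fp16_t h = GGML_FP32_TO_FP16(x);
    return GGML_FP16_TO_FP32(h); // approximately x, within fp16 precision
}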
#define GGML_HASHTABLE_FULL ((size_t)-1)
#define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2)
struct ggml_hash_set ggml_hash_set_new(size_t size);
bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted
size_t ggml_hash_find (const struct ggml_hash_set hash_set, struct ggml_tensor * key);
// returns GGML_HASHTABLE_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
size_t ggml_hash_insert ( struct ggml_hash_set hash_set, struct ggml_tensor * key);
// return index, asserts if table is full
size_t ggml_hash_find_or_insert( struct ggml_hash_set hash_set, struct ggml_tensor * key);
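// Illustrative sketch (not part of the original internal header): using the hash
// set to deduplicate tensors while walking a list of graph nodes. The set never
// grows, so it is sized generously up front; how its storage is released is not
// shown in this header and is left out of the sketch.
static void example_hash_dedup(struct ggml_tensor ** nodes, int n) {
    struct ggml_hash_set visited = ggml_hash_set_new(2*(size_t)n);
    for (int i = 0; i < n; ++i) {
        if (ggml_hash_contains(visited, nodes[i])) {
            continue; // already seen
        }
        ggml_hash_insert(visited, nodes[i]); // asserts if the table is full
    }
}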
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,46 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_vk_device {
int index;
int type; // same as VkPhysicalDeviceType
size_t heapSize;
const char * name;
const char * vendor;
int subgroupSize;
uint64_t bufferAlignment;
uint64_t maxAlloc;
};
struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count);
bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name);
bool ggml_vk_has_vulkan(void);
bool ggml_vk_has_device(void);
struct ggml_vk_device ggml_vk_current_device(void);
//
// backend API
//
// forward declaration
typedef struct ggml_backend * ggml_backend_t;
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
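// Illustrative sketch (not part of the original header): listing the devices seen
// by the Kompute backend and initializing the first one with enough memory.
// Ownership of the array returned by ggml_vk_available_devices is an assumption
// here (freed with free()); check the implementation before relying on it.
#include <stdio.h>
#include <stdlib.h>

ggml_backend_t example_init_kompute(size_t memory_required) {
    if (!ggml_vk_has_vulkan()) {
        return NULL;
    }
    size_t count = 0;
    struct ggml_vk_device * devs = ggml_vk_available_devices(memory_required, &count);
    ggml_backend_t backend = NULL;
    for (size_t i = 0; i < count; ++i) {
        printf("vk device %d: %s (%s), heap %zu MiB\n",
               devs[i].index, devs[i].name, devs[i].vendor, devs[i].heapSize/(1024*1024));
        if (backend == NULL) {
            backend = ggml_backend_kompute_init(devs[i].index);
        }
    }
    free(devs); // assumption: the caller owns the returned array
    return backend;
}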
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,66 @@
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
// A similar interface can be created for other GPU backends (e.g. Vulkan, CUDA, OpenCL, etc.)
//
// How it works:
//
// As long as your program can create and evaluate a ggml_cgraph on the CPU, you can use this
// interface to evaluate the same graph on the GPU. Instead of using ggml_graph_compute(), you
// use ggml_metal_graph_compute() (or ggml_vulkan_graph_compute(), etc.)
//
// You only need to make sure that all memory buffers that you used during the graph creation
// are mapped to the device memory with the ggml_metal_add_buffer() function. This mapping is
// used during the graph evaluation to determine the arguments of the compute kernels.
//
// Synchronization between device and host memory (for example for input and output tensors)
// is done with the ggml_metal_set_tensor() and ggml_metal_get_tensor() functions.
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stddef.h>
#include <stdbool.h>
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 64
struct ggml_tensor;
struct ggml_cgraph;
#ifdef __cplusplus
extern "C" {
#endif
//
// backend API
// user-code should use only these functions
//
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
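// Illustrative sketch (not part of the original header): the backend-API path
// described at the top of this file - initialize the Metal backend and compute a
// graph with it. The graph's tensors are assumed to already live in Metal backend
// buffers (e.g. allocated with ggml-alloc), and the GPU family constant (7 ~
// Apple7) is an assumption taken from the feature-set tables linked above.
ggml_backend_t example_init_metal(struct ggml_cgraph * graph) {
    ggml_backend_t backend = ggml_backend_metal_init();
    if (backend == NULL) {
        return NULL;
    }
    if (!ggml_backend_metal_supports_family(backend, 7)) {
        // older GPU family: still usable, but some kernels may be slower
    }
    ggml_backend_metal_set_n_cb(backend, 4);    // number of command buffers
    ggml_backend_graph_compute(backend, graph); // declared in ggml-backend.h
    return backend;
}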
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,36 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
GGML_API void ggml_cl_init(void);
GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_add(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst);
GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
// GGML_API void * ggml_cl_host_malloc(size_t size);
// GGML_API void ggml_cl_host_free(void * ptr);
GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);
GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
// backend API
// GGML_API ggml_backend_t ggml_backend_opencl_init(void);
// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void);
// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void);
#ifdef __cplusplus
}
#endif

File diff suppressed because it is too large

View File

@ -0,0 +1,133 @@
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs_reference(const float * GGML_RESTRICT x, block_iq3_xxs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl_reference (const float * GGML_RESTRICT x, block_iq4_nl * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs_reference (const float * GGML_RESTRICT x, block_iq4_xs * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s_reference (const float * GGML_RESTRICT x, block_iq3_s * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s_reference (const float * GGML_RESTRICT x, block_iq2_s * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_xxs(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq3_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_row_iq2_s (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
// Dequantization
void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xxs(const block_iq2_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_xs (const block_iq2_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq2_s (const block_iq2_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_xxs(const block_iq3_xxs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_s (const block_iq1_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq1_m (const block_iq1_m * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_nl (const block_iq4_nl * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq4_xs (const block_iq4_xs * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
void dequantize_row_iq3_s (const block_iq3_s * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
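// Illustrative sketch (not part of the original internal header): quantizing one
// row of floats to Q8_0 and dequantizing it back. k must be a multiple of the
// block size QK8_0 (32 elements per block in ggml-common.h at the time of writing).
#include <stdlib.h>

static void example_q8_0_roundtrip(const float * src, float * dst, int64_t k) {
    const int64_t n_blocks = k / QK8_0;
    block_q8_0 * q = (block_q8_0 *) malloc(n_blocks * sizeof(block_q8_0));
    quantize_row_q8_0_reference(src, q, k); // reference (scalar) quantization
    dequantize_row_q8_0(q, dst, k);         // dst[i] ~ src[i] within quantization error
    free(q);
}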
// Dot product
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_iq2_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq2_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_xxs(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq1_m (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_nl (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq4_xs (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_iq3_s (const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q6_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q5_1(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q8_0(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
void iq2xs_init_impl(enum ggml_type type);
void iq2xs_free_impl(enum ggml_type type);
void iq3xs_init_impl(int grid_size);
void iq3xs_free_impl(int grid_size);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,49 @@
//
// MIT license
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: MIT
//
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_SYCL_MAX_DEVICES 48
#define GGML_SYCL_NAME "SYCL"
// backend API
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
// device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id);
// TODO: these are temporary
// ref: https://github.com/ggerganov/llama.cpp/pull/6022#issuecomment-1992615670
GGML_API GGML_CALL int ggml_backend_sycl_get_device_id(int device_index);
GGML_API GGML_CALL void ggml_backend_sycl_set_single_device_mode(int main_gpu_id);
GGML_API GGML_CALL void ggml_backend_sycl_set_mul_device_mode();
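// Illustrative sketch (not part of the original header): listing the SYCL devices
// and creating a backend on device 0.
#include <stdio.h>

ggml_backend_t example_init_sycl(void) {
    if (ggml_backend_sycl_get_device_count() <= 0) {
        return NULL;
    }
    ggml_backend_sycl_print_sycl_devices(); // logs the available devices
    char desc[128];
    ggml_sycl_get_device_description(0, desc, sizeof(desc));
    printf("using SYCL device 0: %s\n", desc);
    return ggml_backend_sycl_init(0);
}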
// SYCL doesn't support registering host memory, keep here for reference
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,29 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_API void ggml_vk_instance_init(void);
// backend API
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
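// Illustrative sketch (not part of the original header): the same pattern for the
// Vulkan backend - enumerate devices, print one, then initialize it by index.
#include <stdio.h>

ggml_backend_t example_init_vulkan(size_t dev_num) {
    if ((int) dev_num >= ggml_backend_vk_get_device_count()) {
        return NULL;
    }
    char   desc[128];
    size_t free_mem, total_mem;
    ggml_backend_vk_get_device_description((int) dev_num, desc, sizeof(desc));
    ggml_backend_vk_get_device_memory((int) dev_num, &free_mem, &total_mem);
    printf("Vulkan device %zu: %s (%zu MiB free)\n", dev_num, desc, free_mem/(1024*1024));
    return ggml_backend_vk_init(dev_num);
}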
#ifdef __cplusplus
}
#endif

View File

@ -1,14 +0,0 @@
ggml-metal-embed.o: \
ggml-metal.metal \
ggml-common.h
@echo "Embedding Metal library"
@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@$(AS) $(TEMP_ASSEMBLY) -o $@
@rm -f ${TEMP_ASSEMBLY}

File diff suppressed because it is too large

View File

@ -3,13 +3,6 @@
#include "whisper.h"
typedef struct {
VALUE *context;
VALUE user_data;
VALUE callback;
VALUE callbacks;
} ruby_whisper_callback_container;
typedef struct {
struct whisper_context *context;
} ruby_whisper;
@ -17,9 +10,6 @@ typedef struct {
typedef struct {
struct whisper_full_params params;
bool diarize;
ruby_whisper_callback_container *new_segment_callback_container;
ruby_whisper_callback_container *progress_callback_container;
ruby_whisper_callback_container *abort_callback_container;
} ruby_whisper_params;
#endif

View File

@ -1,29 +0,0 @@
---
- ../../src/whisper.cpp
- ../../include/whisper.h
- ../../ggml/src/ggml.c
- ../../ggml/src/ggml-impl.h
- ../../ggml/src/ggml-aarch64.h
- ../../ggml/src/ggml-aarch64.c
- ../../ggml/src/ggml-alloc.c
- ../../ggml/src/ggml-backend-impl.h
- ../../ggml/src/ggml-backend.cpp
- ../../ggml/src/ggml-common.h
- ../../ggml/src/ggml-quants.h
- ../../ggml/src/ggml-quants.c
- ../../ggml/src/ggml-cpu-impl.h
- ../../ggml/src/ggml-metal.m
- ../../ggml/src/ggml-metal.metal
- ../../ggml/src/ggml-blas.cpp
- ../../ggml/include/ggml.h
- ../../ggml/include/ggml-alloc.h
- ../../ggml/include/ggml-backend.h
- ../../ggml/include/ggml-cuda.h
- ../../ggml/include/ggml-kompute.h
- ../../ggml/include/ggml-metal.h
- ../../ggml/include/ggml-sycl.h
- ../../ggml/include/ggml-vulkan.h
- ../../ggml/include/ggml-blas.h
- ../../scripts/get-flags.mk
- ../../examples/dr_wav.h
- ../../LICENSE

View File

@ -1,163 +0,0 @@
require "test/unit"
require "whisper"
class TestCallback < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
def setup
GC.start
@params = Whisper::Params.new
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
@audio = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
end
def test_new_segment_callback
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_kind_of Integer, n_new
assert n_new > 0
assert_same @whisper, context
n_segments = context.full_n_segments
n_new.times do |i|
i_segment = n_segments - 1 + i
start_time = context.full_get_segment_t0(i_segment) * 10
end_time = context.full_get_segment_t1(i_segment) * 10
text = context.full_get_segment_text(i_segment)
assert_kind_of Integer, start_time
assert start_time >= 0
assert_kind_of Integer, end_time
assert end_time > 0
assert_match /ask not what your country can do for you, ask what you can do for your country/, text if i_segment == 0
end
}
@whisper.transcribe(@audio, @params)
end
def test_new_segment_callback_closure
search_word = "what"
@params.new_segment_callback = ->(context, state, n_new, user_data) {
n_segments = context.full_n_segments
n_new.times do |i|
i_segment = n_segments - 1 + i
text = context.full_get_segment_text(i_segment)
if text.include?(search_word)
t0 = context.full_get_segment_t0(i_segment)
t1 = context.full_get_segment_t1(i_segment)
raise "search word '#{search_word}' found at between #{t0} and #{t1}"
end
end
}
assert_raise RuntimeError do
@whisper.transcribe(@audio, @params)
end
end
def test_new_segment_callback_user_data
udata = Object.new
@params.new_segment_callback_user_data = udata
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_same udata, user_data
}
@whisper.transcribe(@audio, @params)
end
def test_new_segment_callback_user_data_gc
@params.new_segment_callback_user_data = "My user data"
@params.new_segment_callback = ->(context, state, n_new, user_data) {
assert_equal "My user data", user_data
}
GC.start
assert_same @whisper, @whisper.transcribe(@audio, @params)
end
def test_progress_callback
first = nil
last = nil
@params.progress_callback = ->(context, state, progress, user_data) {
assert_kind_of Integer, progress
assert 0 <= progress && progress <= 100
assert_same @whisper, context
first = progress if first.nil?
last = progress
}
@whisper.transcribe(@audio, @params)
assert_equal 0, first
assert_equal 100, last
end
def test_progress_callback_user_data
udata = Object.new
@params.progress_callback_user_data = udata
@params.progress_callback = ->(context, state, n_new, user_data) {
assert_same udata, user_data
}
@whisper.transcribe(@audio, @params)
end
def test_on_progress
first = nil
last = nil
@params.on_progress do |progress|
assert_kind_of Integer, progress
assert 0 <= progress && progress <= 100
first = progress if first.nil?
last = progress
end
@whisper.transcribe(@audio, @params)
assert_equal 0, first
assert_equal 100, last
end
def test_abort_callback
i = 0
@params.abort_callback = ->(user_data) {
assert_nil user_data
i += 1
return false
}
@whisper.transcribe(@audio, @params)
assert i > 0
end
def test_abort_callback_abort
i = 0
@params.abort_callback = ->(user_data) {
i += 1
return i == 3
}
@whisper.transcribe(@audio, @params)
assert_equal 3, i
end
def test_abort_callback_user_data
udata = Object.new
@params.abort_callback_user_data = udata
yielded = nil
@params.abort_callback = ->(user_data) {
yielded = user_data
}
@whisper.transcribe(@audio, @params)
assert_same udata, yielded
end
def test_abort_on
do_abort = false
aborted_from_callback = false
@params.on_new_segment do |segment|
do_abort = true if segment.text.match? /ask/
end
i = 0
@params.abort_on do
i += 1
do_abort
end
@whisper.transcribe(@audio, @params)
assert i > 0
end
end

View File

@ -1,31 +0,0 @@
require 'test/unit'
require 'tempfile'
require 'tmpdir'
require 'shellwords'
class TestPackage < Test::Unit::TestCase
def test_build
Tempfile.create do |file|
assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)
assert file.size > 0
assert_path_exist file.to_path
end
end
sub_test_case "Building binary on installation" do
def setup
system "rake", "build", exception: true
end
def test_install
match_data = `rake -Tbuild`.match(/(whispercpp-(.+)\.gem)/)
filename = match_data[1]
version = match_data[2]
basename = "whisper.#{RbConfig::CONFIG["DLEXT"]}"
Dir.mktmpdir do |dir|
system "gem", "install", "--install-dir", dir.shellescape, "pkg/#{filename.shellescape}", exception: true
assert_path_exist File.join(dir, "gems/whispercpp-#{version}/lib", basename)
end
end
end
end

View File

@ -1,155 +0,0 @@
require 'test/unit'
require 'whisper'
class TestParams < Test::Unit::TestCase
def setup
@params = Whisper::Params.new
end
def test_language
@params.language = "en"
assert_equal @params.language, "en"
@params.language = "auto"
assert_equal @params.language, "auto"
end
def test_offset
@params.offset = 10_000
assert_equal @params.offset, 10_000
@params.offset = 0
assert_equal @params.offset, 0
end
def test_duration
@params.duration = 60_000
assert_equal @params.duration, 60_000
@params.duration = 0
assert_equal @params.duration, 0
end
def test_max_text_tokens
@params.max_text_tokens = 300
assert_equal @params.max_text_tokens, 300
@params.max_text_tokens = 0
assert_equal @params.max_text_tokens, 0
end
def test_translate
@params.translate = true
assert @params.translate
@params.translate = false
assert !@params.translate
end
def test_no_context
@params.no_context = true
assert @params.no_context
@params.no_context = false
assert !@params.no_context
end
def test_single_segment
@params.single_segment = true
assert @params.single_segment
@params.single_segment = false
assert !@params.single_segment
end
def test_print_special
@params.print_special = true
assert @params.print_special
@params.print_special = false
assert !@params.print_special
end
def test_print_progress
@params.print_progress = true
assert @params.print_progress
@params.print_progress = false
assert !@params.print_progress
end
def test_print_realtime
@params.print_realtime = true
assert @params.print_realtime
@params.print_realtime = false
assert !@params.print_realtime
end
def test_print_timestamps
@params.print_timestamps = true
assert @params.print_timestamps
@params.print_timestamps = false
assert !@params.print_timestamps
end
def test_suppress_blank
@params.suppress_blank = true
assert @params.suppress_blank
@params.suppress_blank = false
assert !@params.suppress_blank
end
def test_suppress_non_speech_tokens
@params.suppress_non_speech_tokens = true
assert @params.suppress_non_speech_tokens
@params.suppress_non_speech_tokens = false
assert !@params.suppress_non_speech_tokens
end
def test_token_timestamps
@params.token_timestamps = true
assert @params.token_timestamps
@params.token_timestamps = false
assert !@params.token_timestamps
end
def test_split_on_word
@params.split_on_word = true
assert @params.split_on_word
@params.split_on_word = false
assert !@params.split_on_word
end
def test_initial_prompt
assert_nil @params.initial_prompt
@params.initial_prompt = "You are a polite person."
assert_equal "You are a polite person.", @params.initial_prompt
end
def test_temperature
assert_equal 0.0, @params.temperature
@params.temperature = 0.5
assert_equal 0.5, @params.temperature
end
def test_max_initial_ts
assert_equal 1.0, @params.max_initial_ts
@params.max_initial_ts = 600.0
assert_equal 600.0, @params.max_initial_ts
end
def test_length_penalty
assert_equal -1.0, @params.length_penalty
@params.length_penalty = 0.5
assert_equal 0.5, @params.length_penalty
end
def test_temperature_inc
assert_in_delta 0.2, @params.temperature_inc
@params.temperature_inc = 0.5
assert_in_delta 0.5, @params.temperature_inc
end
def test_entropy_thold
assert_in_delta 2.4, @params.entropy_thold
@params.entropy_thold = 3.0
assert_in_delta 3.0, @params.entropy_thold
end
def test_logprob_thold
assert_in_delta -1.0, @params.logprob_thold
@params.logprob_thold = -0.5
assert_in_delta -0.5, @params.logprob_thold
end
end

View File

@ -1,87 +0,0 @@
require "test/unit"
require "whisper"
class TestSegment < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params)
end
end
def test_iteration
whisper.each_segment do |segment|
assert_instance_of Whisper::Segment, segment
end
end
def test_enumerator
enum = whisper.each_segment
assert_instance_of Enumerator, enum
enum.to_a.each_with_index do |segment, index|
assert_instance_of Whisper::Segment, segment
assert_kind_of Integer, index
end
end
def test_start_time
i = 0
whisper.each_segment do |segment|
assert_equal 0, segment.start_time if i == 0
i += 1
end
end
def test_end_time
i = 0
whisper.each_segment do |segment|
assert_equal whisper.full_get_segment_t1(i) * 10, segment.end_time
i += 1
end
end
def test_on_new_segment
params = Whisper::Params.new
seg = nil
index = 0
params.on_new_segment do |segment|
assert_instance_of Whisper::Segment, segment
if index == 0
seg = segment
assert_equal 0, segment.start_time
assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
end
index += 1
end
whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
assert_equal 0, seg.start_time
assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
end
def test_on_new_segment_twice
params = Whisper::Params.new
seg = nil
params.on_new_segment do |segment|
seg = segment
return
end
params.on_new_segment do |segment|
assert_same seg, segment
return
end
whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
end
private
def whisper
self.class.whisper
end
end

View File

@ -1,13 +1,122 @@
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
EXTDIR = File.join(TOPDIR, 'ext')
#$LIBDIR = File.join(TOPDIR, 'lib')
#$:.unshift(LIBDIR)
$:.unshift(EXTDIR)
require 'whisper'
require 'test/unit'
class TestWhisper < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
def setup
@params = Whisper::Params.new
end
def test_language
@params.language = "en"
assert_equal @params.language, "en"
@params.language = "auto"
assert_equal @params.language, "auto"
end
def test_offset
@params.offset = 10_000
assert_equal @params.offset, 10_000
@params.offset = 0
assert_equal @params.offset, 0
end
def test_duration
@params.duration = 60_000
assert_equal @params.duration, 60_000
@params.duration = 0
assert_equal @params.duration, 0
end
def test_max_text_tokens
@params.max_text_tokens = 300
assert_equal @params.max_text_tokens, 300
@params.max_text_tokens = 0
assert_equal @params.max_text_tokens, 0
end
def test_translate
@params.translate = true
assert @params.translate
@params.translate = false
assert !@params.translate
end
def test_no_context
@params.no_context = true
assert @params.no_context
@params.no_context = false
assert !@params.no_context
end
def test_single_segment
@params.single_segment = true
assert @params.single_segment
@params.single_segment = false
assert !@params.single_segment
end
def test_print_special
@params.print_special = true
assert @params.print_special
@params.print_special = false
assert !@params.print_special
end
def test_print_progress
@params.print_progress = true
assert @params.print_progress
@params.print_progress = false
assert !@params.print_progress
end
def test_print_realtime
@params.print_realtime = true
assert @params.print_realtime
@params.print_realtime = false
assert !@params.print_realtime
end
def test_print_timestamps
@params.print_timestamps = true
assert @params.print_timestamps
@params.print_timestamps = false
assert !@params.print_timestamps
end
def test_suppress_blank
@params.suppress_blank = true
assert @params.suppress_blank
@params.suppress_blank = false
assert !@params.suppress_blank
end
def test_suppress_non_speech_tokens
@params.suppress_non_speech_tokens = true
assert @params.suppress_non_speech_tokens
@params.suppress_non_speech_tokens = false
assert !@params.suppress_non_speech_tokens
end
def test_token_timestamps
@params.token_timestamps = true
assert @params.token_timestamps
@params.token_timestamps = false
assert !@params.token_timestamps
end
def test_split_on_word
@params.split_on_word = true
assert @params.split_on_word
@params.split_on_word = false
assert !@params.split_on_word
end
def test_whisper
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
@ -19,81 +128,4 @@ class TestWhisper < Test::Unit::TestCase
}
end
sub_test_case "After transcription" do
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params)
end
end
def whisper
self.class.whisper
end
def test_full_n_segments
assert_equal 1, whisper.full_n_segments
end
def test_full_lang_id
assert_equal 0, whisper.full_lang_id
end
def test_full_get_segment_t0
assert_equal 0, whisper.full_get_segment_t0(0)
assert_raise IndexError do
whisper.full_get_segment_t0(whisper.full_n_segments)
end
assert_raise IndexError do
whisper.full_get_segment_t0(-1)
end
end
def test_full_get_segment_t1
t1 = whisper.full_get_segment_t1(0)
assert_kind_of Integer, t1
assert t1 > 0
assert_raise IndexError do
whisper.full_get_segment_t1(whisper.full_n_segments)
end
end
def test_full_get_segment_speaker_turn_next
assert_false whisper.full_get_segment_speaker_turn_next(0)
end
def test_full_get_segment_text
assert_match /ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0)
end
end
def test_lang_max_id
assert_kind_of Integer, Whisper.lang_max_id
end
def test_lang_id
assert_equal 0, Whisper.lang_id("en")
assert_raise ArgumentError do
Whisper.lang_id("non existing language")
end
end
def test_lang_str
assert_equal "en", Whisper.lang_str(0)
assert_raise IndexError do
Whisper.lang_str(Whisper.lang_max_id + 1)
end
end
def test_lang_str_full
assert_equal "english", Whisper.lang_str_full(0)
assert_raise IndexError do
Whisper.lang_str_full(Whisper.lang_max_id + 1)
end
end
end

View File

@ -1,5 +1,3 @@
require "yaml"
Gem::Specification.new do |s|
s.name = "whispercpp"
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
@ -9,16 +7,10 @@ Gem::Specification.new do |s|
s.email = 'todd.fisher@gmail.com'
s.extra_rdoc_files = ['LICENSE', 'README.md']
s.files = `git ls-files . -z`.split("\x0") +
YAML.load_file("extsources.yaml").collect {|file|
basename = File.basename(file)
if s.extra_rdoc_files.include?(basename)
basename
else
File.join("ext", basename)
end
}
s.files = ["LICENSE", "README.md", "Rakefile", "ext/extconf.rb", "ext/ggml.c", "ext/ruby_whisper.cpp", "ext/whisper.cpp", "ext/dr_wav.h", "ext/ggml.h", "ext/ruby_whisper.h", "ext/whisper.h"]
#### Load-time details
s.require_paths = ['lib','ext']
s.summary = %q{Ruby whisper.cpp bindings}
s.test_files = ["tests/test_whisper.rb"]

View File

@ -13,5 +13,5 @@ set_target_properties(${TARGET}
PROPERTIES
EXPORT_COMPILE_COMMANDS ON
RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin"
INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib"
)

View File

@ -36,7 +36,7 @@ include(FindPackageHandleStandardArgs)
# The default components were taken from a survey over other FindFFMPEG.cmake files
if (NOT FFmpeg_FIND_COMPONENTS)
set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE)
set(FFmpeg_FIND_COMPONENTS AVFORMAT AVCODEC AVUTIL SWRESAMPLE)
endif()
#
@ -84,7 +84,7 @@ macro(find_component _component _pkgconfig _library _header)
# CMake's default is to search first for shared libraries and then for static libraries.
# Todo later: add option to prefer static libs over dynamic:
find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a
find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a
HINTS
${PC_${_component}_LIBDIR}
${PC_${_component}_LIBRARY_DIRS}

View File

@ -1,7 +1,7 @@
set(WHISPER_VERSION @WHISPER_INSTALL_VERSION@)
set(WHISPER_BUILD_COMMIT @WHISPER_BUILD_COMMIT@)
set(WHISPER_BUILD_NUMBER @WHISPER_BUILD_NUMBER@)
set(WHISPER_SHARED_LIB @BUILD_SHARED_LIBS@)
set(LLAMA_VERSION @LLAMA_INSTALL_VERSION@)
set(LLAMA_BUILD_COMMIT @LLAMA_BUILD_COMMIT@)
set(LLAMA_BUILD_NUMBER @LLAMA_BUILD_NUMBER@)
set(LLAMA_SHARED_LIB @BUILD_SHARED_LIBS@)
set(GGML_BLAS @GGML_BLAS@)
set(GGML_CUDA @GGML_CUDA@)
@ -11,9 +11,9 @@ set(GGML_ACCELERATE @GGML_ACCELERATE@)
@PACKAGE_INIT@
set_and_check(WHISPER_INCLUDE_DIR "@PACKAGE_WHISPER_INCLUDE_INSTALL_DIR@")
set_and_check(WHISPER_LIB_DIR "@PACKAGE_WHISPER_LIB_INSTALL_DIR@")
set_and_check(WHISPER_BIN_DIR "@PACKAGE_WHISPER_BIN_INSTALL_DIR@")
set_and_check(LLAMA_INCLUDE_DIR "@PACKAGE_LLAMA_INCLUDE_INSTALL_DIR@")
set_and_check(LLAMA_LIB_DIR "@PACKAGE_LLAMA_LIB_INSTALL_DIR@")
set_and_check(LLAMA_BIN_DIR "@PACKAGE_LLAMA_BIN_INSTALL_DIR@")
# Ensure transient dependencies satisfied
@ -43,23 +43,23 @@ if (GGML_HIPBLAS)
find_package(rocblas REQUIRED)
endif()
find_library(whisper_LIBRARY whisper
find_library(llama_LIBRARY llama
REQUIRED
HINTS ${WHISPER_LIB_DIR})
HINTS ${LLAMA_LIB_DIR})
set(_whisper_link_deps "Threads::Threads" "@WHISPER_EXTRA_LIBS@")
set(_whisper_transient_defines "@WHISPER_TRANSIENT_DEFINES@")
set(_llama_link_deps "Threads::Threads" "@LLAMA_EXTRA_LIBS@")
set(_llama_transient_defines "@LLAMA_TRANSIENT_DEFINES@")
add_library(whisper UNKNOWN IMPORTED)
add_library(llama UNKNOWN IMPORTED)
set_target_properties(whisper
set_target_properties(llama
PROPERTIES
INTERFACE_INCLUDE_DIRECTORIES "${WHISPER_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_whisper_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_whisper_transient_defines}"
INTERFACE_INCLUDE_DIRECTORIES "${LLAMA_INCLUDE_DIR}"
INTERFACE_LINK_LIBRARIES "${_llama_link_deps}"
INTERFACE_COMPILE_DEFINITIONS "${_llama_transient_defines}"
IMPORTED_LINK_INTERFACE_LANGUAGES "CXX"
IMPORTED_LOCATION "${whisper_LIBRARY}"
IMPORTED_LOCATION "${llama_LIBRARY}"
INTERFACE_COMPILE_FEATURES cxx_std_11
POSITION_INDEPENDENT_CODE ON )
check_required_components(whisper)
check_required_components(Llama)

View File

@ -1,6 +1,6 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
libdir=${exec_prefix}/lib
includedir=${prefix}/include
Name: whisper

View File

@ -40,7 +40,7 @@ if (WHISPER_FFMPEG)
message(STATUS "Found ffmpeg libs: ${FFMPEG_LIBRARIES}")
message(STATUS "Found ffmpeg headers in: ${FFMPEG_INCLUDE_DIRS}")
message(STATUS "ffmpeg definitions: ${FFMPEG_DEFINITIONS}")
message(STATUS "Found avformat ${AVFORMAT_VERSION}")
message(STATUS "Found avformat ${AVFORMAT_VERSION}")
include_directories(${FFMPEG_INCLUDE_DIRS})
add_compile_definitions(WHISPER_FFMPEG)
@ -102,8 +102,8 @@ if (EMSCRIPTEN)
set_target_properties(libstream PROPERTIES FOLDER "libs")
add_subdirectory(command.wasm)
set_target_properties(libcommand PROPERTIES FOLDER "libs")
#add_subdirectory(talk.wasm)
#set_target_properties(libtalk PROPERTIES FOLDER "libs")
add_subdirectory(talk.wasm)
set_target_properties(libtalk PROPERTIES FOLDER "libs")
add_subdirectory(bench.wasm)
set_target_properties(libbench PROPERTIES FOLDER "libs")
elseif(CMAKE_JS_VERSION)
@ -127,10 +127,8 @@ endif (WHISPER_SDL2)
add_subdirectory(quantize)
set_target_properties(quantize PROPERTIES FOLDER "examples")
if (WHISPER_SDL2)
# TODO: disabled until update
# https://github.com/ggerganov/whisper.cpp/issues/1818
#add_subdirectory(talk)
#set_target_properties(talk PROPERTIES FOLDER "examples")
add_subdirectory(talk)
set_target_properties(talk PROPERTIES FOLDER "examples")
add_subdirectory(talk-llama)
set_target_properties(talk-llama PROPERTIES FOLDER "examples")
add_subdirectory(lsp)

View File

@ -72,9 +72,6 @@ bool ggml_common_quantize_0(
case GGML_FTYPE_MOSTLY_IQ4_XS:
case GGML_FTYPE_MOSTLY_IQ1_M:
case GGML_FTYPE_MOSTLY_BF16:
case GGML_FTYPE_MOSTLY_Q4_0_4_4:
case GGML_FTYPE_MOSTLY_Q4_0_4_8:
case GGML_FTYPE_MOSTLY_Q4_0_8_8:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
@ -212,11 +209,6 @@ bool ggml_common_quantize_0(
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0_4_4:
case GGML_TYPE_Q4_0_4_8:
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));

View File

@ -147,6 +147,7 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
case 7: return "He";
case 8: return "She";
case 9: return "They";
default: return "To";
}
return "The";

View File

@ -9,7 +9,6 @@
#include <thread>
#include <ctime>
#include <fstream>
#include <sstream>
#define COMMON_SAMPLE_RATE 16000
@ -287,43 +286,12 @@ void sam_print_usage(int argc, char ** argv, const sam_params & params);
// Terminal utils
//
#define SQR(X) ((X) * (X))
#define UNCUBE(x) x < 48 ? 0 : x < 115 ? 1 : (x - 35) / 40
/**
* Quantizes 24-bit RGB to xterm256 code range [16,256).
*/
static int rgb2xterm256(int r, int g, int b) {
unsigned char cube[] = {0, 0137, 0207, 0257, 0327, 0377};
int av, ir, ig, ib, il, qr, qg, qb, ql;
av = r * .299 + g * .587 + b * .114 + .5;
ql = (il = av > 238 ? 23 : (av - 3) / 10) * 10 + 8;
qr = cube[(ir = UNCUBE(r))];
qg = cube[(ig = UNCUBE(g))];
qb = cube[(ib = UNCUBE(b))];
if (SQR(qr - r) + SQR(qg - g) + SQR(qb - b) <=
SQR(ql - r) + SQR(ql - g) + SQR(ql - b))
return ir * 36 + ig * 6 + ib + 020;
return il + 0350;
}
static std::string set_xterm256_foreground(int r, int g, int b) {
int x = rgb2xterm256(r, g, b);
std::ostringstream oss;
oss << "\033[38;5;" << x << "m";
return oss.str();
}
// Lowest is red, middle is yellow, highest is green. Color scheme from
// Paul Tol; it is colorblind friendly https://personal.sron.nl/~pault/
// Terminal color map. 10 colors grouped in ranges [0.0, 0.1, ..., 0.9]
// Lowest is red, middle is yellow, highest is green.
const std::vector<std::string> k_colors = {
set_xterm256_foreground(220, 5, 12),
set_xterm256_foreground(232, 96, 28),
set_xterm256_foreground(241, 147, 45),
set_xterm256_foreground(246, 193, 65),
set_xterm256_foreground(247, 240, 86),
set_xterm256_foreground(144, 201, 135),
set_xterm256_foreground( 78, 178, 101),
"\033[38;5;196m", "\033[38;5;202m", "\033[38;5;208m", "\033[38;5;214m", "\033[38;5;220m",
"\033[38;5;226m", "\033[38;5;190m", "\033[38;5;154m", "\033[38;5;118m", "\033[38;5;82m",
};
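The k_colors palette above is used by the examples to shade each decoded token by its probability (lowest is red, highest is green, per the comment). A minimal sketch of how a probability might be mapped onto this palette is shown below; the helper name and the cubing of the probability are illustrative assumptions, not code from this diff.

// Illustrative: pick an escape sequence from k_colors for a token probability p in [0, 1]
static const std::string & color_for_prob(float p) {
    const int n = (int) k_colors.size();
    int idx = (int) (p * p * p * n); // emphasize high-confidence tokens (assumption)
    if (idx < 0)     idx = 0;
    if (idx > n - 1) idx = n - 1;
    return k_colors[idx];
}
// usage: printf("%s%s\033[0m", color_for_prob(p).c_str(), token_text);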
//

File diff suppressed because it is too large

View File

@ -321,7 +321,7 @@ int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_da
LOG("Couldn't map input file %s\n", ifname.c_str());
return err;
}
LOG("Mapped input file: %s size: %d\n", ibuf, (int) ibuf_size);
LOG("Mapped input file: %x size: %d\n", ibuf, ibuf_size);
struct audio_buffer inaudio_buf;
inaudio_buf.ptr = ibuf;
inaudio_buf.size = ibuf_size;

View File

@ -48,7 +48,7 @@ if [ -n "$3" ]; then
fi
# Whisper models
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" "large-v3-turbo" )
models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large-v3" )
# list available models
function list_models {

View File

@ -997,7 +997,6 @@ int main(int argc, char ** argv) {
if (params.dtw == "large.v1") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V1;
if (params.dtw == "large.v2") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V2;
if (params.dtw == "large.v3") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3;
if (params.dtw == "large.v3.turbo") cparams.dtw_aheads_preset = WHISPER_AHEADS_LARGE_V3_TURBO;
if (cparams.dtw_aheads_preset == WHISPER_AHEADS_NONE) {
fprintf(stderr, "error: unknown DTW preset '%s'\n", params.dtw.c_str());

View File

@ -21,7 +21,7 @@ def process_audio(wav_file, model_name="base.en"):
if not os.path.exists(wav_file):
raise FileNotFoundError(f"WAV file not found: {wav_file}")
full_command = f"./main -m {model} -f {wav_file} -nt"
full_command = f"./main -m {model} -f {wav_file} -np -nt"
# Execute the command
process = subprocess.Popen(full_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

View File

@ -34,7 +34,6 @@ struct server_params
std::string hostname = "127.0.0.1";
std::string public_path = "examples/server/public";
std::string request_path = "";
std::string inference_path = "/inference";
int32_t port = 8080;
int32_t read_timeout = 600;
@ -133,7 +132,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " --port PORT, [%-7d] Port number for the server\n", sparams.port);
fprintf(stderr, " --public PATH, [%-7s] Path to the public folder\n", sparams.public_path.c_str());
fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str());
fprintf(stderr, " --inference-path PATH, [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
fprintf(stderr, "\n");
}
@ -184,7 +182,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
else if ( arg == "--public") { sparams.public_path = argv[++i]; }
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
@ -219,7 +216,7 @@ void check_ffmpeg_availibility() {
bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
std::ostringstream cmd_stream;
std::string converted_filename_temp = temp_filename + "_temp.wav";
cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -y -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
std::string cmd = cmd_stream.str();
int status = std::system(cmd.c_str());
@ -647,10 +644,10 @@ int main(int argc, char ** argv) {
return false;
});
svr.Options(sparams.request_path + sparams.inference_path, [&](const Request &, Response &){
svr.Options(sparams.request_path + "/inference", [&](const Request &, Response &){
});
svr.Post(sparams.request_path + sparams.inference_path, [&](const Request &req, Response &res){
svr.Post(sparams.request_path + "/inference", [&](const Request &req, Response &res){
// acquire whisper model mutex lock
std::lock_guard<std::mutex> lock(whisper_mutex);
@ -677,8 +674,7 @@ int main(int argc, char ** argv) {
if (sparams.ffmpeg_converter) {
// if file is not wav, convert to wav
// write to temporary file
const std::string temp_filename_base = std::tmpnam(nullptr);
const std::string temp_filename = temp_filename_base + ".wav";
const std::string temp_filename = "whisper_server_temp_file.wav";
std::ofstream temp_file{temp_filename, std::ios::binary};
temp_file << audio_file.content;
temp_file.close();

View File

@ -1,13 +1,7 @@
if (WHISPER_SDL2)
# talk-llama
set(TARGET talk-llama)
add_executable(${TARGET} talk-llama.cpp
llama.cpp
llama-vocab.cpp
llama-grammar.cpp
llama-sampling.cpp
unicode.cpp
unicode-data.cpp)
add_executable(${TARGET} talk-llama.cpp llama.cpp unicode.cpp unicode-data.cpp)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
if (WHISPER_CLBLAST)

File diff suppressed because it is too large

View File

@ -1,144 +0,0 @@
#pragma once
#include "llama-impl.h"
#include <map>
struct llama_vocab;
// grammar element type
enum llama_gretype {
// end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
// any character (.)
LLAMA_GRETYPE_CHAR_ANY = 7,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
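For context, a grammar rule is stored as a flat sequence of llama_grammar_element values terminated by LLAMA_GRETYPE_END, with LLAMA_GRETYPE_ALT separating alternates. A minimal sketch of how the GBNF rule `root ::= "a" | "b"` could be laid out follows; it is illustrative and not taken from this diff.

// Illustrative encoding of: root ::= "a" | "b"
const llama_grammar_element root_rule[] = {
    { LLAMA_GRETYPE_CHAR, 'a' }, // terminal 'a'
    { LLAMA_GRETYPE_ALT,  0   }, // start of the alternate definition
    { LLAMA_GRETYPE_CHAR, 'b' }, // terminal 'b'
    { LLAMA_GRETYPE_END,  0   }, // end of rule
};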
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
using llama_grammar_rule = std::vector< llama_grammar_element>;
using llama_grammar_stack = std::vector<const llama_grammar_element *>;
using llama_grammar_rules = std::vector<llama_grammar_rule>;
using llama_grammar_stacks = std::vector<llama_grammar_stack>;
using llama_grammar_candidates = std::vector<llama_grammar_candidate>;
const llama_grammar_rules & llama_grammar_get_rules (const struct llama_grammar * grammar);
llama_grammar_stacks & llama_grammar_get_stacks( struct llama_grammar * grammar);
// takes a set of possible pushdown stacks on a grammar, which are required to
// be positioned at a character range (see `llama_grammar_advance_stack`), and
// produces the N possible stacks if the given char is accepted at those
// positions
void llama_grammar_accept(
const llama_grammar_rules & rules,
const llama_grammar_stacks & stacks,
uint32_t chr,
llama_grammar_stacks & stacks_new);
std::vector<llama_grammar_candidate> llama_grammar_reject_candidates_for_stack(
const llama_grammar_rules & rules,
const llama_grammar_stack & stack,
const llama_grammar_candidates & candidates);
struct llama_grammar_parser {
std::map<std::string, uint32_t> symbol_ids;
llama_grammar_rules rules;
llama_grammar_stack c_rules() const;
uint32_t get_symbol_id(const char * src, size_t len);
uint32_t generate_symbol_id(const std::string & base_name);
void add_rule(uint32_t rule_id, const llama_grammar_rule & rule);
const char * parse_alternates(
const char * src,
const std::string & rule_name,
uint32_t rule_id,
bool is_nested);
const char * parse_sequence(
const char * src,
const std::string & rule_name,
llama_grammar_rule & rule,
bool is_nested);
const char * parse_rule(const char * src);
bool parse(const char * src);
void print(FILE * file);
};
struct llama_grammar {
// note: allow null vocab for testing (not great)
const llama_vocab * vocab;
const llama_grammar_rules rules; // TODO: shared ptr
llama_grammar_stacks stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
//
// internal API
//
// note: needed for tests (not great)
struct llama_grammar * llama_grammar_init_impl(
const struct llama_vocab * vocab,
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
struct llama_grammar * llama_grammar_init_impl(const struct llama_vocab * vocab, const char * grammar_str, const char * grammar_root);
void llama_grammar_free_impl(struct llama_grammar * grammar);
struct llama_grammar * llama_grammar_clone_impl(const struct llama_grammar & grammar);
// TODO: move the API below as member functions of llama_grammar
void llama_grammar_apply_impl(
const struct llama_grammar & grammar,
llama_token_data_array * cur_p);
void llama_grammar_accept_impl(
struct llama_grammar & grammar,
llama_token token);

View File

@ -1,181 +0,0 @@
#pragma once
#include "llama.h"
#include <string>
#include <vector>
#include <stdexcept>
#ifdef __GNUC__
#ifdef __MINGW32__
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
#else
#define LLAMA_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
#endif
#else
#define LLAMA_ATTRIBUTE_FORMAT(...)
#endif
//
// logging
//
LLAMA_ATTRIBUTE_FORMAT(2, 3)
void llama_log_internal (ggml_log_level level, const char * format, ...);
void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data);
#define LLAMA_LOG(...) llama_log_internal(GGML_LOG_LEVEL_NONE , __VA_ARGS__)
#define LLAMA_LOG_INFO(...) llama_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
#define LLAMA_LOG_WARN(...) llama_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
#define LLAMA_LOG_ERROR(...) llama_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
#define LLAMA_LOG_DEBUG(...) llama_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
#define LLAMA_LOG_CONT(...) llama_log_internal(GGML_LOG_LEVEL_CONT , __VA_ARGS__)
//
// helpers
//
struct time_meas {
time_meas(int64_t & t_acc, bool disable = false) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
~time_meas() {
if (t_start_us >= 0) {
t_acc += ggml_time_us() - t_start_us;
}
}
const int64_t t_start_us;
int64_t & t_acc;
};
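time_meas above is a small RAII helper: it records the start time on construction and adds the elapsed microseconds to the referenced accumulator on destruction (unless constructed with disable = true). A minimal usage sketch, purely illustrative:

// Illustrative: accumulate the time spent in a scope into t_sample_us
int64_t t_sample_us = 0;
{
    time_meas tm(t_sample_us); // starts the clock
    // ... work being measured ...
}                              // destructor adds the elapsed microseconds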
static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
if (search.empty()) {
return;
}
std::string builder;
builder.reserve(s.length());
size_t pos = 0;
size_t last_pos = 0;
while ((pos = s.find(search, last_pos)) != std::string::npos) {
builder.append(s, last_pos, pos - last_pos);
builder.append(replace);
last_pos = pos + search.length();
}
builder.append(s, last_pos, std::string::npos);
s = std::move(builder);
}
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
// the ring buffer works similarly to std::deque, but with a fixed capacity
template<typename T>
struct ring_buffer {
ring_buffer(size_t cap) : capacity(cap), data(cap) {}
T & front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
const T & front() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[first];
}
T & back() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
const T & back() const {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
return data[pos];
}
void push_back(const T & value) {
if (capacity == 0) {
throw std::runtime_error("ring buffer: capacity is zero");
}
if (sz == capacity) {
// advance the start when buffer is full
first = (first + 1) % capacity;
} else {
sz++;
}
data[pos] = value;
pos = (pos + 1) % capacity;
}
T pop_front() {
if (sz == 0) {
throw std::runtime_error("ring buffer is empty");
}
T value = data[first];
first = (first + 1) % capacity;
sz--;
return value;
}
//T & operator[](size_t i) {
// if (i >= sz) {
// throw std::runtime_error("ring buffer: index out of bounds");
// }
// return data[(first + i) % capacity];
//}
//const T & at(size_t i) const {
// if (i >= sz) {
// throw std::runtime_error("ring buffer: index out of bounds");
// }
// return data[(first + i) % capacity];
//}
const T & rat(size_t i) const {
if (i >= sz) {
throw std::runtime_error("ring buffer: index out of bounds");
}
return data[(first + sz - i - 1) % capacity];
}
std::vector<T> to_vector() const {
std::vector<T> result;
result.reserve(sz);
for (size_t i = 0; i < sz; i++) {
result.push_back(data[(first + i) % capacity]);
}
return result;
}
void clear() {
// here only reset the status of the buffer
sz = 0;
first = 0;
pos = 0;
}
bool empty() const {
return sz == 0;
}
size_t size() const {
return sz;
}
size_t capacity = 0;
size_t sz = 0;
size_t first = 0;
size_t pos = 0;
std::vector<T> data;
};
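A short usage sketch for the ring buffer above (illustrative): once the fixed capacity is reached, push_back overwrites the oldest element, front() is the oldest surviving value and rat(0) is the most recently pushed one.

// Illustrative: keep only the last 3 values
ring_buffer<int> rb(3);
for (int v : {1, 2, 3, 4, 5}) {
    rb.push_back(v); // 1 and 2 fall out once the buffer is full
}
int oldest = rb.front();               // 3
int newest = rb.rat(0);                // 5
std::vector<int> all = rb.to_vector(); // {3, 4, 5}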

File diff suppressed because it is too large

View File

@ -1,29 +0,0 @@
#pragma once
// TODO: rename llama-sampling.h/.cpp to llama-sampler.h/.cpp ?
#include "llama-grammar.h"
#include <unordered_map>
struct llama_vocab;
struct llama_grammar;
// sampler chain
struct llama_sampler_chain {
llama_sampler_chain_params params;
std::vector<struct llama_sampler *> samplers;
// timing
mutable int64_t t_sample_us;
mutable int32_t n_sample;
};
struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,
const char * grammar_root);

File diff suppressed because it is too large

View File

@ -1,146 +0,0 @@
#pragma once
#include "llama-impl.h"
#include <string>
#include <vector>
#include <unordered_map>
#include <map>
#include <set>
struct llm_tokenizer;
struct llama_vocab {
using id = llama_token;
using token = std::string;
using tattr = llama_token_attr;
struct token_data {
token text;
float score;
tattr attr;
};
uint32_t n_vocab = 0; // TODO: not great because has to keep in sync with hparams.n_vocab
enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
enum llama_vocab_pre_type type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
int max_token_len = 0; // used for optimizing longest token search
std::unordered_map<token, id> token_to_id;
std::vector<token_data> id_to_token;
std::vector<id> cache_special_tokens;
std::vector<token> cache_token_to_piece; // llama_token_to_piece(special = true);
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
id special_bos_id = 1;
id special_eos_id = 2;
id special_unk_id = 0;
id special_sep_id = -1;
id special_pad_id = -1;
id special_cls_id = -1;
id special_mask_id = -1;
id linefeed_id = 13;
id special_prefix_id = -1;
id special_suffix_id = -1;
id special_middle_id = -1;
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
id special_eom_id = -1;
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
// tokenizer flags
bool tokenizer_add_space_prefix = false;
bool tokenizer_add_bos = false;
bool tokenizer_add_eos = false;
bool tokenizer_ignore_merges = false;
bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
bool tokenizer_remove_extra_whitespaces = false;
bool tokenizer_escape_whitespaces = true;
bool tokenizer_treat_whitespace_as_suffix = false;
std::vector<char> precompiled_charsmap;
llm_tokenizer * tokenizer = nullptr;
llama_vocab() = default;
~llama_vocab();
int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
void init_tokenizer();
};
//
// internal API
//
// TODO: rename to llama_tokenize_impl
// TODO: This should probably be in llama.h
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special = false);
// TODO: move the API below as member functions of llama_vocab
llama_token llama_byte_to_token_impl(const llama_vocab & vocab, uint8_t ch);
const char * llama_token_get_text_impl(const struct llama_vocab & vocab, llama_token token);
float llama_token_get_score_impl(const struct llama_vocab & vocab, llama_token token);
llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token);
bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token token);
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
const char * text,
int32_t text_len,
llama_token * tokens,
int32_t n_tokens_max,
bool add_special,
bool parse_special);
// does not write null-terminator to buf
int32_t llama_token_to_piece_impl(
const struct llama_vocab & vocab,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);

File diff suppressed because it is too large

View File

@ -33,18 +33,17 @@
#define LLAMA_DEFAULT_SEED 0xFFFFFFFF
// TODO: use everywhere in the implementation
#define LLAMA_TOKEN_NULL -1
#define LLAMA_MAX_RNG_STATE (64*1024)
#define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
#define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'
#define LLAMA_FILE_MAGIC_GGSQ 0x67677371u // 'ggsq'
#define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
#define LLAMA_SESSION_VERSION 9
#define LLAMA_SESSION_VERSION 6
#define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ
#define LLAMA_STATE_SEQ_VERSION 2
#define LLAMA_STATE_SEQ_VERSION 1
#ifdef __cplusplus
extern "C" {
@ -56,10 +55,8 @@ extern "C" {
// TODO: show sample usage
//
// struct llama_vocab; // TODO: add in the future
struct llama_model;
struct llama_context;
struct llama_sampler;
typedef int32_t llama_pos;
typedef int32_t llama_token;
@ -70,8 +67,6 @@ extern "C" {
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
LLAMA_VOCAB_TYPE_UGM = 4, // T5 tokenizer based on Unigram
LLAMA_VOCAB_TYPE_RWKV = 5, // RWKV tokenizer based on greedy tokenization
};
// pre-tokenization types
@ -92,23 +87,15 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
LLAMA_VOCAB_PRE_TYPE_PORO = 15,
LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
};
// note: these values should be synchronized with ggml_rope
// TODO: maybe move this enum to ggml.h (ggml_rope_type)
enum llama_rope_type {
LLAMA_ROPE_TYPE_NONE = -1,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = GGML_ROPE_TYPE_NEOX,
LLAMA_ROPE_TYPE_NORM = 0,
LLAMA_ROPE_TYPE_NEOX = 2,
LLAMA_ROPE_TYPE_GLM = 4,
};
enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@ -141,7 +128,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
// LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
// LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
// LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
@ -170,11 +157,6 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_4 = 33, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_4_8 = 34, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0_8_8 = 35, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ1_0 = 36, // except 1d tensors
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@ -193,22 +175,14 @@ extern "C" {
LLAMA_POOLING_TYPE_MEAN = 1,
LLAMA_POOLING_TYPE_CLS = 2,
LLAMA_POOLING_TYPE_LAST = 3,
LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph
};
enum llama_attention_type {
LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1,
LLAMA_ATTENTION_TYPE_CAUSAL = 0,
LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1,
};
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
};
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
typedef struct llama_token_data {
llama_token id; // token id
float logit; // log-odds of the token
@ -216,10 +190,8 @@ extern "C" {
} llama_token_data;
typedef struct llama_token_data_array {
// TODO: consider SoA
llama_token_data * data;
size_t size;
int64_t selected; // this is the index in the data array (i.e. not the token id)
bool sorted;
} llama_token_data_array;
@ -280,9 +252,9 @@ extern "C" {
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// main_gpu interpretation depends on split_mode:
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_MODE_LAYER: ignored
// LLAMA_SPLIT_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_LAYER: ignored
int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@ -312,16 +284,16 @@ extern "C" {
// NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
// https://github.com/ggerganov/llama.cpp/pull/7544
struct llama_context_params {
uint32_t seed; // RNG seed, -1 for random
uint32_t n_ctx; // text context, 0 = from model
uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode
uint32_t n_ubatch; // physical maximum batch size
uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models)
int32_t n_threads; // number of threads to use for generation
int32_t n_threads_batch; // number of threads to use for batch processing
uint32_t n_threads; // number of threads to use for generation
uint32_t n_threads_batch; // number of threads to use for batch processing
enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id
enum llama_attention_type attention_type; // attention type to use for embeddings
// ref: https://github.com/ggerganov/llama.cpp/pull/2054
float rope_freq_base; // RoPE base frequency, 0 = from model
@ -339,13 +311,11 @@ extern "C" {
enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
// Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
// TODO: move at the end of the struct
// Keep the booleans together to avoid misalignment during copy-by-value.
bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
bool embeddings; // if true, extract embeddings (together with logits)
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
// Abort callback
// if it returns true, execution of llama_decode() will be aborted
@ -359,7 +329,7 @@ extern "C" {
int32_t nthread; // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
enum llama_ftype ftype; // quantize to this llama_ftype
enum ggml_type output_tensor_type; // output tensor type
enum ggml_type token_embedding_type; // token embeddings tensor type
enum ggml_type token_embedding_type; // itoken embeddings tensor type
bool allow_requantize; // allow quantizing non-f32/f16 tensors
bool quantize_output_tensor; // quantize output.weight
bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
@ -369,14 +339,56 @@ extern "C" {
void * kv_overrides; // pointer to vector containing overrides
} llama_model_quantize_params;
typedef struct llama_logit_bias {
llama_token token;
float bias;
} llama_logit_bias;
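llama_logit_bias pairs a token id with an additive bias applied to that token's logit before sampling; combined with llama_sampler_init_logit_bias() and llama_sampler_chain_add() (both declared further down in this header) it is one of the building blocks of the new sampler-chain API. A minimal sketch, assuming model and chain already exist; the token id and bias value are placeholders.

// Illustrative: push a token's logit far down so it is effectively never sampled
const llama_logit_bias biases[] = {
    { 123, -1e9f }, // hypothetical token id, additive bias on its logit
};
llama_sampler * sb = llama_sampler_init_logit_bias(llama_n_vocab(model), 1, biases);
llama_sampler_chain_add(chain, sb); // the chain takes ownership of sb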
// grammar types
struct llama_grammar;
typedef struct llama_sampler_chain_params {
bool no_perf; // whether to measure performance timings
} llama_sampler_chain_params;
// grammar element type
enum llama_gretype {
// end of rule definition
LLAMA_GRETYPE_END = 0,
// start of alternate definition for rule
LLAMA_GRETYPE_ALT = 1,
// non-terminal element: reference to rule
LLAMA_GRETYPE_RULE_REF = 2,
// terminal element: character (code point)
LLAMA_GRETYPE_CHAR = 3,
// inverse char(s) ([^a], [^a-b] [^abc])
LLAMA_GRETYPE_CHAR_NOT = 4,
// modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to
// be an inclusive range ([a-z])
LLAMA_GRETYPE_CHAR_RNG_UPPER = 5,
// modifies a preceding LLAMA_GRETYPE_CHAR or
// LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA])
LLAMA_GRETYPE_CHAR_ALT = 6,
// any character (.)
LLAMA_GRETYPE_CHAR_ANY = 7,
};
typedef struct llama_grammar_element {
enum llama_gretype type;
uint32_t value; // Unicode code point or rule ID
} llama_grammar_element;
// performance timing information
struct llama_timings {
double t_start_ms;
double t_end_ms;
double t_load_ms;
double t_sample_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_sample;
int32_t n_p_eval;
int32_t n_eval;
};
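The llama_timings struct above is filled in by llama_get_timings() and summarized by llama_print_timings(), both declared near the end of this header. A minimal sketch, assuming ctx is a valid llama_context pointer (fprintf requires <cstdio>):

// Illustrative: report evaluation counts and times after generation
const llama_timings t = llama_get_timings(ctx);
fprintf(stderr, "prompt eval: %d tokens in %.2f ms\n", t.n_p_eval, t.t_p_eval_ms);
fprintf(stderr, "       eval: %d tokens in %.2f ms\n", t.n_eval,   t.t_eval_ms);
// or simply:
llama_print_timings(ctx);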
// used in chat template
typedef struct llama_chat_message {
@ -384,14 +396,9 @@ extern "C" {
const char * content;
} llama_chat_message;
// lora adapter
struct llama_lora_adapter;
// Helpers for getting default parameters
// TODO: update API to start accepting pointers to params structs (https://github.com/ggerganov/llama.cpp/discussions/9172)
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void);
LLAMA_API struct llama_model_params llama_model_default_params(void);
LLAMA_API struct llama_context_params llama_context_default_params(void);
LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void);
// Initialize the llama + ggml backend
@ -402,23 +409,15 @@ extern "C" {
//optional:
LLAMA_API void llama_numa_init(enum ggml_numa_strategy numa);
// Optional: an auto threadpool gets created in ggml if not passed explicitly
LLAMA_API void llama_attach_threadpool(
struct llama_context * ctx,
ggml_threadpool_t threadpool,
ggml_threadpool_t threadpool_batch);
LLAMA_API void llama_detach_threadpool(struct llama_context * ctx);
// Call once at the end of the program - currently only used for MPI
LLAMA_API void llama_backend_free(void);
LLAMA_API struct llama_model * llama_load_model_from_file(
const char * path_model,
struct llama_model_params params);
struct llama_model_params params);
LLAMA_API void llama_free_model(struct llama_model * model);
// TODO: rename to llama_init_from_model
LLAMA_API struct llama_context * llama_new_context_with_model(
struct llama_model * model,
struct llama_context_params params);
@ -434,22 +433,22 @@ extern "C" {
LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void);
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
LLAMA_API int32_t llama_n_vocab (const struct llama_model * model);
LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model);
LLAMA_API int32_t llama_n_embd (const struct llama_model * model);
LLAMA_API int32_t llama_n_layer (const struct llama_model * model);
LLAMA_API int32_t llama_n_head (const struct llama_model * model);
LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx);
LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model);
LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model);
// Get the model's RoPE frequency scaling factor
LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);
@ -483,51 +482,24 @@ extern "C" {
// Get a llama model tensor
LLAMA_API struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
// Returns true if the model contains an encoder that requires llama_encode() call
LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);
// Returns true if the model contains a decoder that requires llama_decode() call
LLAMA_API bool llama_model_has_decoder(const struct llama_model * model);
// For encoder-decoder models, this function returns id of the token that must be provided
// to the decoder to start generating output sequence. For other models, it returns -1.
LLAMA_API llama_token llama_model_decoder_start_token(const struct llama_model * model);
// Returns true if the model is recurrent (like Mamba, RWKV, etc.)
LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model);
// Returns 0 on success
LLAMA_API uint32_t llama_model_quantize(
const char * fname_inp,
const char * fname_out,
const llama_model_quantize_params * params);
// Load a LoRA adapter from file
// The loaded adapter will be associated to the given model, and will be free when the model is deleted
LLAMA_API struct llama_lora_adapter * llama_lora_adapter_init(
struct llama_model * model,
const char * path_lora);
// Add a loaded LoRA adapter to given context
// This will not modify model's weight
LLAMA_API int32_t llama_lora_adapter_set(
struct llama_context * ctx,
struct llama_lora_adapter * adapter,
float scale);
// Remove a specific LoRA adapter from given context
// Return -1 if the adapter is not present in the context
LLAMA_API int32_t llama_lora_adapter_remove(
struct llama_context * ctx,
struct llama_lora_adapter * adapter);
// Remove all LoRA adapters from given context
LLAMA_API void llama_lora_adapter_clear(
struct llama_context * ctx);
// Manually free a LoRA adapter
// Note: loaded adapters will be free when the associated model is deleted
LLAMA_API void llama_lora_adapter_free(struct llama_lora_adapter * adapter);
// Apply a LoRA adapter to a loaded model
// path_base_model is the path to a higher quality model to use as a base for
// the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// will be applied on top of the previous one
// Returns 0 on success
LLAMA_API int32_t llama_model_apply_lora_from_file(
const struct llama_model * model,
const char * path_lora,
float scale,
const char * path_base_model,
int32_t n_threads);
// Apply a loaded control vector to a llama_context, or if data is NULL, clear
// the currently loaded vector.
@ -677,11 +649,10 @@ extern "C" {
// State / sessions
//
// Returns the *actual* size in bytes of the state
// (logits, embedding and kv_cache)
// Only use when saving the state, not when restoring it, otherwise the size may be too small.
LLAMA_API size_t llama_state_get_size(struct llama_context * ctx);
LLAMA_API DEPRECATED(size_t llama_get_state_size(struct llama_context * ctx),
// Returns the maximum size in bytes of the state (rng, logits, embedding
// and kv_cache) - will often be smaller after compacting tokens
LLAMA_API size_t llama_state_get_size(const struct llama_context * ctx);
LLAMA_API DEPRECATED(size_t llama_get_state_size(const struct llama_context * ctx),
"use llama_state_get_size instead");
// Copies the state to the specified destination address.
@ -689,8 +660,7 @@ extern "C" {
// Returns the number of bytes copied
LLAMA_API size_t llama_state_get_data(
struct llama_context * ctx,
uint8_t * dst,
size_t size);
uint8_t * dst);
LLAMA_API DEPRECATED(size_t llama_copy_state_data(
struct llama_context * ctx,
uint8_t * dst),
@ -700,8 +670,7 @@ extern "C" {
// Returns the number of bytes read
LLAMA_API size_t llama_state_set_data(
struct llama_context * ctx,
const uint8_t * src,
size_t size);
const uint8_t * src);
LLAMA_API DEPRECATED(size_t llama_set_state_data(
struct llama_context * ctx,
const uint8_t * src),
@ -743,7 +712,6 @@ extern "C" {
LLAMA_API size_t llama_state_seq_get_data(
struct llama_context * ctx,
uint8_t * dst,
size_t size,
llama_seq_id seq_id);
// Copy the sequence data (originally copied with `llama_state_seq_get_data`) into the specified sequence
@ -753,7 +721,6 @@ extern "C" {
LLAMA_API size_t llama_state_seq_set_data(
struct llama_context * ctx,
const uint8_t * src,
size_t size,
llama_seq_id dest_seq_id);
LLAMA_API size_t llama_state_seq_save_file(
@ -800,14 +767,6 @@ extern "C" {
// Frees a batch of tokens allocated with llama_batch_init()
LLAMA_API void llama_batch_free(struct llama_batch batch);
// Processes a batch of tokens with the encoder part of the encoder-decoder model.
// Stores the encoder output internally for later use by the decoder cross-attention layers.
// 0 - success
// < 0 - error
LLAMA_API int32_t llama_encode(
struct llama_context * ctx,
struct llama_batch batch);
// Positive return values do not mean a fatal error, but rather a warning.
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
@ -819,13 +778,13 @@ extern "C" {
// Set the number of threads used for decoding
// n_threads is the number of threads used for generation (single token)
// n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens)
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch);
LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch);
// Get the number of threads used for generation of a single token.
LLAMA_API int32_t llama_n_threads(struct llama_context * ctx);
LLAMA_API uint32_t llama_n_threads(struct llama_context * ctx);
// Get the number of threads used for prompt and batch processing (multiple token).
LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
LLAMA_API uint32_t llama_n_threads_batch(struct llama_context * ctx);
// Set whether the model is in embeddings mode or not
// If true, embeddings will be returned but logits will not
@ -873,8 +832,7 @@ extern "C" {
// Get the embeddings for a sequence id
// Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE
// when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence
// otherwise: float[n_embd] (1-dimensional)
// shape: [n_embd] (1-dimensional)
LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id);
//
@ -899,10 +857,12 @@ extern "C" {
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
// Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
// Returns -1 if unknown, 1 for true or 0 for false.
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
// Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
@ -913,14 +873,11 @@ extern "C" {
//
// Tokenization
//
// The API is thread-safe.
//
/// @details Convert the provided text into tokens.
/// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
/// @return Returns the number of tokens on success, no more than n_tokens_max
/// @return Returns a negative number on failure - the number of tokens that would have been returned
/// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
/// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
/// as plaintext. Does not insert a leading space.
LLAMA_API int32_t llama_tokenize(
@ -935,35 +892,15 @@ extern "C" {
// Token Id -> Piece.
// Uses the vocabulary in the provided context.
// Does not write null terminator to the buffer.
// User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
// User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
// @param special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_token_to_piece(
const struct llama_model * model,
llama_token token,
char * buf,
int32_t length,
int32_t lstrip,
bool special);
/// @details Convert the provided tokens into text (inverse of llama_tokenize()).
/// @param text The char pointer must be large enough to hold the resulting text.
/// @return Returns the number of chars/bytes on success, no more than text_len_max.
/// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
/// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
/// @param unparse_special If true, special tokens are rendered in the output.
LLAMA_API int32_t llama_detokenize(
const struct llama_model * model,
const llama_token * tokens,
int32_t n_tokens,
char * text,
int32_t text_len_max,
bool remove_special,
bool unparse_special);
//
// Chat templates
//
/// Apply chat template. Inspired by hf apply_chat_template() on python.
/// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
/// NOTE: This function does not use a jinja parser. It only supports a pre-defined list of templates. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
@ -984,114 +921,104 @@ extern "C" {
int32_t length);
//
// Sampling API
//
// Sample usage:
//
// // prepare the sampling chain at the start
// auto sparams = llama_sampler_chain_default_params();
//
// llama_sampler * smpl = llama_sampler_chain_init(sparams);
//
// llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50));
// llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1));
// llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8));
//
// // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat"
// // this sampler will be responsible to select the actual token
// llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed));
//
// ...
//
// // decoding loop:
// while (...) {
// ...
//
// llama_decode(ctx, batch);
//
// // sample from the logits of the last token in the batch
// const llama_token id = llama_sampler_sample(smpl, ctx, -1);
//
// // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.)
// llama_sampler_accept(smpl, id);
// ...
// }
//
// llama_sampler_free(smpl);
//
// TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU).
// TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab
// Grammar
//
typedef void * llama_sampler_context_t;
LLAMA_API struct llama_grammar * llama_grammar_init(
const llama_grammar_element ** rules,
size_t n_rules,
size_t start_rule_index);
// user code can implement the interface below in order to create custom llama_sampler
struct llama_sampler_i {
const char * (*name) (const struct llama_sampler * smpl); // can be NULL
void (*accept)( struct llama_sampler * smpl, llama_token token); // can be NULL
void (*apply) ( struct llama_sampler * smpl, llama_token_data_array * cur_p); // required
void (*reset) ( struct llama_sampler * smpl); // can be NULL
struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL
void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL
LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
// TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph
//void (*apply_ggml) (struct llama_sampler * smpl, ...);
};
LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
struct llama_sampler {
struct llama_sampler_i * iface;
llama_sampler_context_t ctx;
};
//
// Sampling functions
//
// mirror of llama_sampler_i:
LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl);
LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token);
LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p);
LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl);
LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl);
// important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add)
LLAMA_API void llama_sampler_free ( struct llama_sampler * smpl);
// Sets the current rng seed.
LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed);
// llama_sampler_chain
// a type of llama_sampler that can chain multiple samplers one after another
/// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
/// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
LLAMA_API void llama_sample_repetition_penalties(
struct llama_context * ctx,
llama_token_data_array * candidates,
const llama_token * last_tokens,
size_t penalty_last_n,
float penalty_repeat,
float penalty_freq,
float penalty_present);
LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params);
// important: takes ownership of the sampler object and will free it when llama_sampler_free is called
LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl);
LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i);
LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain);
// after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed
LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i);
// available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
/// @param logits Logits extracted from the original generation context.
/// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context.
/// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance.
LLAMA_API void llama_sample_apply_guidance(
struct llama_context * ctx,
float * logits,
float * logits_guidance,
float scale);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
LLAMA_API void llama_sample_softmax(
struct llama_context * ctx,
llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
LLAMA_API void llama_sample_top_k(
struct llama_context * ctx,
llama_token_data_array * candidates,
int32_t k,
size_t min_keep);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep);
LLAMA_API void llama_sample_top_p(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
LLAMA_API void llama_sample_min_p(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
LLAMA_API void llama_sample_tail_free(
struct llama_context * ctx,
llama_token_data_array * candidates,
float z,
size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
LLAMA_API void llama_sample_typical(
struct llama_context * ctx,
llama_token_data_array * candidates,
float p,
size_t min_keep);
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
/// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API void llama_sample_entropy(
struct llama_context * ctx,
llama_token_data_array * candidates_p,
float min_temp,
float max_temp,
float exponent_val);
LLAMA_API void llama_sample_temp(
struct llama_context * ctx,
llama_token_data_array * candidates,
float temp);
/// @details Apply constraints from grammar
LLAMA_API void llama_sample_grammar(
struct llama_context * ctx,
llama_token_data_array * candidates,
const struct llama_grammar * grammar);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
@ -1099,62 +1026,42 @@ extern "C" {
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API struct llama_sampler * llama_sampler_init_mirostat(
int32_t n_vocab,
uint32_t seed,
float tau,
float eta,
int32_t m);
LLAMA_API llama_token llama_sample_token_mirostat(
struct llama_context * ctx,
llama_token_data_array * candidates,
float tau,
float eta,
int32_t m,
float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2(
uint32_t seed,
float tau,
float eta);
LLAMA_API llama_token llama_sample_token_mirostat_v2(
struct llama_context * ctx,
llama_token_data_array * candidates,
float tau,
float eta,
float * mu);
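A minimal sketch of driving Mirostat through the chain-based sampler shown above (assuming `ctx` already holds fresh logits from a decode; the seed/tau/eta values are illustrative only):
llama_sampler * smpl = llama_sampler_chain_init(llama_sampler_chain_default_params());
llama_sampler_chain_add(smpl, llama_sampler_init_mirostat_v2(/*seed*/ 42, /*tau*/ 5.0f, /*eta*/ 0.1f));
const llama_token id = llama_sampler_sample(smpl, ctx, -1); // samples and accepts the token
llama_sampler_free(smpl);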
LLAMA_API struct llama_sampler * llama_sampler_init_grammar(
const struct llama_model * model,
const char * grammar_str,
const char * grammar_root);
/// @details Selects the token with the highest probability.
/// Does not compute the token probabilities. Use llama_sample_softmax() instead.
LLAMA_API llama_token llama_sample_token_greedy(
struct llama_context * ctx,
llama_token_data_array * candidates);
LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
int32_t n_vocab, // llama_n_vocab()
llama_token special_eos_id, // llama_token_eos()
llama_token linefeed_id, // llama_token_nl()
int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size)
float penalty_repeat, // 1.0 = disabled
float penalty_freq, // 0.0 = disabled
float penalty_present, // 0.0 = disabled
bool penalize_nl, // consider newlines as a repeatable token
bool ignore_eos); // ignore the end-of-sequence token
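As a sketch only, a penalties stage configured with the parameters documented above could be added to an existing chain `smpl` like this (`model` is a loaded `llama_model`; the numeric values are illustrative):
llama_sampler_chain_add(smpl, llama_sampler_init_penalties(
        llama_n_vocab(model), llama_token_eos(model), llama_token_nl(model),
        /*penalty_last_n*/ 256, /*penalty_repeat*/ 1.18f,
        /*penalty_freq*/ 0.0f, /*penalty_present*/ 0.0f,
        /*penalize_nl*/ false, /*ignore_eos*/ true));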
/// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
LLAMA_API llama_token llama_sample_token(
struct llama_context * ctx,
llama_token_data_array * candidates);
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab,
int32_t n_logit_bias,
const llama_logit_bias * logit_bias);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
/// @details Sample and accept a token from the idx-th output of the last evaluation
//
// Shorthand for:
// const auto * logits = llama_get_logits_ith(ctx, idx);
// llama_token_data_array cur_p = { ... init from logits ... };
// llama_sampler_apply(smpl, &cur_p);
// auto token = cur_p.data[cur_p.selected].id;
// llama_sampler_accept(smpl, token);
// return token;
// Returns the sampled token
LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx);
// TODO: extend in the future
//LLAMA_API void llama_decode_with_sampler(struct llama_context * ctx, struct llama_sampler * smpl, struct llama_batch batch, ...);
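A minimal sketch of one generation step built on the shorthand documented above (`smpl` and `ctx` are assumed to exist; `llama_sampler_sample` also accepts the sampled token into the sampler):
const llama_token id = llama_sampler_sample(smpl, ctx, /*idx*/ -1);
if (id != llama_token_eos(llama_get_model(ctx))) {
    // append `id` to the next batch and decode again
}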
/// @details Accepts the sampled token into the grammar
LLAMA_API void llama_grammar_accept_token(
struct llama_context * ctx,
struct llama_grammar * grammar,
llama_token token);
//
// Model split
@ -1170,6 +1077,12 @@ extern "C" {
// Returns the split_prefix length.
LLAMA_API int llama_split_prefix(char * split_prefix, size_t maxlen, const char * split_path, int split_no, int split_count);
// Performance information
LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx);
LLAMA_API void llama_print_timings(struct llama_context * ctx);
LLAMA_API void llama_reset_timings(struct llama_context * ctx);
// Print system information
LLAMA_API const char * llama_print_system_info(void);
@ -1177,41 +1090,58 @@ extern "C" {
// If this is not called, or NULL is supplied, everything is output on stderr.
LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data);
//
// Performance utils
//
// NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
//
struct llama_perf_context_data {
double t_start_ms;
double t_load_ms;
double t_p_eval_ms;
double t_eval_ms;
int32_t n_p_eval;
int32_t n_eval;
};
struct llama_perf_sampler_data {
double t_sample_ms;
int32_t n_sample;
};
LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx);
LLAMA_API void llama_perf_context_print(const struct llama_context * ctx);
LLAMA_API void llama_perf_context_reset( struct llama_context * ctx);
// NOTE: the following work only with samplers constructed via llama_sampler_chain_init
LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx);
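A minimal sketch of reporting timings with the helpers above once generation finishes (`smpl` must be a chain created with `llama_sampler_chain_init`):
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx);
llama_perf_context_reset(ctx); // start a fresh measurement window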
#ifdef __cplusplus
}
#endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include <random>
#include <string>
#include <vector>
struct ggml_tensor;
struct llama_partial_utf8 {
uint32_t value; // bit value so far (unshifted)
int n_remain; // num bytes remaining; -1 indicates invalid sequence
};
struct llama_grammar {
const std::vector<std::vector<llama_grammar_element>> rules;
std::vector<std::vector<const llama_grammar_element *>> stacks;
// buffer for partially generated UTF-8 sequence from accepted tokens
llama_partial_utf8 partial_utf8;
};
struct llama_grammar_candidate {
size_t index;
const uint32_t * code_points;
llama_partial_utf8 partial_utf8;
};
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
struct llama_context * ctx
);
void llama_grammar_accept(
const std::vector<std::vector<llama_grammar_element>> & rules,
const std::vector<std::vector<const llama_grammar_element *>> & stacks,
const uint32_t chr,
std::vector<std::vector<const llama_grammar_element *>> & new_stacks);
std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
const std::string & src,
llama_partial_utf8 partial_start);
// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
#endif // LLAMA_API_INTERNAL
#endif // LLAMA_H

View File

@ -35,10 +35,10 @@ static std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
if (n_tokens < 0) {
result.resize(-n_tokens);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), 0, false);
int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), false);
GGML_ASSERT(check == -n_tokens);
} else {
result.resize(n_tokens);
@ -314,6 +314,7 @@ int main(int argc, char ** argv) {
// tune these to your liking
lcparams.n_ctx = 2048;
lcparams.seed = 1;
lcparams.n_threads = params.n_threads;
lcparams.flash_attn = params.flash_attn;
@ -401,26 +402,6 @@ int main(int argc, char ** argv) {
llama_batch batch = llama_batch_init(llama_n_ctx(ctx_llama), 0, 1);
// init sampler
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const int seed = 0;
auto sparams = llama_sampler_chain_default_params();
llama_sampler * smpl = llama_sampler_chain_init(sparams);
if (temp > 0.0f) {
llama_sampler_chain_add(smpl, llama_sampler_init_top_k(top_k));
llama_sampler_chain_add(smpl, llama_sampler_init_top_p(top_p, 1));
llama_sampler_chain_add(smpl, llama_sampler_init_temp (temp));
llama_sampler_chain_add(smpl, llama_sampler_init_dist (seed));
} else {
llama_sampler_chain_add(smpl, llama_sampler_init_greedy());
}
// init session
std::string path_session = params.path_session;
std::vector<llama_token> session_tokens;
@ -436,7 +417,7 @@ int main(int argc, char ** argv) {
session_tokens.resize(llama_n_ctx(ctx_llama));
size_t n_token_count_out = 0;
if (!llama_state_load_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
if (!llama_load_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) {
fprintf(stderr, "%s: error: failed to load session file '%s'\n", __func__, path_session.c_str());
return 1;
}
@ -719,13 +700,54 @@ int main(int argc, char ** argv) {
{
// out of user input, sample next token
const float top_k = 5;
const float top_p = 0.80f;
const float temp = 0.30f;
const float repeat_penalty = 1.1764f;
const int repeat_last_n = 256;
if (!path_session.empty() && need_to_save_session) {
need_to_save_session = false;
llama_state_save_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
llama_save_session_file(ctx_llama, path_session.c_str(), session_tokens.data(), session_tokens.size());
}
const llama_token id = llama_sampler_sample(smpl, ctx_llama, -1);
llama_token id = 0;
{
auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(model_llama);
logits[llama_token_eos(model_llama)] = 0;
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// apply repeat penalty
const float nl_logit = logits[llama_token_nl(model_llama)];
llama_sample_repetition_penalties(ctx_llama, &candidates_p,
embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, repeat_penalty, 0.0, 0.0f);
logits[llama_token_nl(model_llama)] = nl_logit;
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k, 1);
llama_sample_top_p(ctx_llama, &candidates_p, top_p, 1);
llama_sample_temp (ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
}
if (id != llama_token_eos(model_llama)) {
// add it to the context
@ -775,14 +797,8 @@ int main(int argc, char ** argv) {
whisper_print_timings(ctx_wsp);
whisper_free(ctx_wsp);
llama_perf_sampler_print(smpl);
llama_perf_context_print(ctx_llama);
llama_sampler_free(smpl);
llama_batch_free(batch);
llama_print_timings(ctx_llama);
llama_free(ctx_llama);
llama_backend_free();
return 0;
}

View File

@ -7,7 +7,7 @@
#include <unordered_map>
#include <unordered_set>
const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1
{0x000000, 0x0080},
{0x000020, 0x0008},
{0x000021, 0x0020},
@ -2311,8 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
0x003000,
};
// list is always in ascending order, to enable binary search
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {
{0x000041, 0x000061},
{0x000042, 0x000062},
{0x000043, 0x000063},
@ -3748,8 +3747,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
{0x01E921, 0x01E943},
};
// list is always in ascending order, to enable binary search
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {
{0x000061, 0x000041},
{0x000062, 0x000042},
{0x000063, 0x000043},
@ -5202,7 +5200,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase
{0x01E943, 0x01E921},
};
const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, nfd
const std::vector<range_nfd> unicode_ranges_nfd = { // start, last, nfd
{0x000000, 0x000000, 0x000000},
{0x0000C0, 0x0000C5, 0x000041},
{0x0000C7, 0x0000C7, 0x000043},
@ -7032,3 +7030,4 @@ const std::initializer_list<range_nfd> unicode_ranges_nfd = { // start, last, n
{0x02FA1C, 0x02FA1C, 0x009F3B},
{0x02FA1D, 0x02FA1D, 0x02A600},
};

View File

@ -13,8 +13,8 @@ struct range_nfd {
static const uint32_t MAX_CODEPOINTS = 0x110000;
extern const std::initializer_list<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
extern const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags;
extern const std::unordered_set<uint32_t> unicode_set_whitespace;
extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase;
extern const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase;
extern const std::initializer_list<range_nfd> unicode_ranges_nfd;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase;
extern const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase;
extern const std::vector<range_nfd> unicode_ranges_nfd;

View File

@ -1,11 +1,6 @@
#if defined(_MSC_VER)
#define _SILENCE_CXX17_CODECVT_HEADER_DEPRECATION_WARNING
#endif
#include "unicode.h"
#include "unicode-data.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
@ -20,12 +15,6 @@
#include <locale>
#include <codecvt>
size_t unicode_len_utf8(char src) {
const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
return lookup[highbits];
}
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
for (size_t i = 0; i < cps.size(); ++i) {
@ -34,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
return result;
}
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
assert(offset < utf8.size());
if (!(utf8[offset + 0] & 0x80)) {
auto result = utf8[offset + 0];
@ -123,11 +112,11 @@ uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
static std::vector<codepoint_flags> unicode_cpt_flags_array() {
std::vector<codepoint_flags> cpt_flags(MAX_CODEPOINTS, codepoint_flags::UNDEFINED);
assert (unicode_ranges_flags.begin()[0].first == 0);
assert (unicode_ranges_flags.begin()[unicode_ranges_flags.size()-1].first == MAX_CODEPOINTS);
assert (unicode_ranges_flags.front().first == 0);
assert (unicode_ranges_flags.back().first == MAX_CODEPOINTS);
for (size_t i = 1; i < unicode_ranges_flags.size(); ++i) {
const auto range_ini = unicode_ranges_flags.begin()[i-1]; // codepoint_ini, flags
const auto range_end = unicode_ranges_flags.begin()[i]; // codepoint_end, flags
const auto range_ini = unicode_ranges_flags[i-1]; // codepoint_ini, flags
const auto range_end = unicode_ranges_flags[i]; // codepoint_end, flags
for (uint32_t cpt = range_ini.first; cpt < range_end.first; ++cpt) {
cpt_flags[cpt] = range_ini.second;
}
@ -243,7 +232,8 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
};
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
};
size_t _prev_end = offset_ini;
@ -305,9 +295,9 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
continue;
}
// regex: <space>?[^\s\p{L}\p{N}]+
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
pos += (cpt == ' ');
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
flags2 = _get_flags(++pos);
}
_add_token(pos);
@ -361,7 +351,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
};
auto _get_flags = [&] (const size_t pos) -> codepoint_flags {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : codepoint_flags{};
static const codepoint_flags undef(codepoint_flags::UNDEFINED);
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_flags(cpts[pos]) : undef;
};
size_t _prev_end = offset_ini;
@ -403,8 +394,8 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
}
}
// regex: [^\r\n\p{L}\p{N}]?\p{L}+
if (!(cpt == '\r' || cpt == '\n' || flags.is_number)) {
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
if (!(cpt == '\r' || cpt == '\n' || /*flags.is_letter |*/ flags.is_number)) {
if (flags.is_letter || _get_flags(pos+1).is_letter) { // one or more letters
pos++;
while (_get_flags(pos).is_letter) {
@ -430,9 +421,9 @@ static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string &
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
auto flags2 = (cpt == ' ' ? _get_flags(pos+1) : flags);
if (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags.as_uint()) {
if (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
pos += (cpt == ' ');
while (!(flags2.is_whitespace | flags2.is_letter | flags2.is_number) && flags2.as_uint()) {
while (!(flags2.is_whitespace || flags2.is_letter || flags2.is_number || flags2.is_undefined)) {
flags2 = _get_flags(++pos);
}
uint32_t cpt2 = _get_cpt(pos);
@ -597,7 +588,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
std::vector<uint32_t> result(cpts.size());
for (size_t i = 0; i < cpts.size(); ++i) {
const uint32_t cpt = cpts[i];
auto it = std::upper_bound(unicode_ranges_nfd.begin(), unicode_ranges_nfd.end(), cpt, comp) - 1;
auto it = std::upper_bound(unicode_ranges_nfd.cbegin(), unicode_ranges_nfd.cend(), cpt, comp) - 1;
result[i] = (it->first <= cpt && cpt <= it->last) ? it->nfd : cpt;
}
return result;
@ -639,15 +630,8 @@ uint8_t unicode_utf8_to_byte(const std::string & utf8) {
}
uint32_t unicode_tolower(uint32_t cp) {
// binary search
auto it = std::lower_bound(unicode_map_lowercase.begin(), unicode_map_lowercase.end(), cp,
[](const std::pair<uint32_t, uint32_t> & pair, uint32_t value) {
return pair.first < value;
});
if (it != unicode_map_lowercase.end() && it->first == cp) {
return it->second;
}
return cp; // Return the original code point if no lowercase mapping is found
auto it = unicode_map_lowercase.find(cp);
return it == unicode_map_lowercase.end() ? cp : it->second;
}
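Either variant behaves the same from the caller's side, for example:
assert(unicode_tolower(0x0041) == 0x0061); // 'A' -> 'a', per the table entries above
assert(unicode_tolower(0x0031) == 0x0031); // no mapping -> code point returned unchanged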
std::vector<std::string> unicode_regex_split(const std::string & text, const std::vector<std::string> & regex_exprs) {

View File

@ -4,8 +4,6 @@
#include <string>
#include <vector>
// TODO: prefix all symbols with "llama_"
struct codepoint_flags {
enum {
UNDEFINED = 0x0001,
@ -48,10 +46,8 @@ struct codepoint_flags {
}
};
size_t unicode_len_utf8(char src);
std::string unicode_cpt_to_utf8(uint32_t cp);
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8);
std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & cpts);

View File

@ -21,7 +21,7 @@ help()
echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
echo "options:"
echo "-s Step in seconds (default is $step)."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' 'large-v3-turbo' (default is '$model')."
echo "-m Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large-v3' (default is '$model')."
echo "-t Number of threads to use."
echo "-h Print this help page."
echo

View File

@ -7,9 +7,8 @@ set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(SOURCE_FILES
${WHISPER_LIB_DIR}/ggml/src/ggml.c
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
${WHISPER_LIB_DIR}/src/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c

View File

@ -19,9 +19,8 @@ if (NOT GGML_HOME)
SOURCE_FILES
${SOURCE_FILES}
${WHISPER_LIB_DIR}/ggml/src/ggml.c
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.c
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
)
endif()

View File

@ -7,7 +7,6 @@
objects = {
/* Begin PBXBuildFile section */
18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18133C7F2C64E342005CEAAC /* ggml-aarch64.c */; };
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */ = {isa = PBXBuildFile; fileRef = 1844471B2AB21655007D6BFE /* ggml-metal.m */; settings = {COMPILER_FLAGS = "-framework Foundation -framework Metal -framework MetalKit -fno-objc-arc"; }; };
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
@ -22,7 +21,7 @@
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
18A276062C2A98A5001C8D37 /* ggml-metal.metal in Copy Files */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.c */; };
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
@ -45,8 +44,6 @@
/* End PBXCopyFilesBuildPhase section */
/* Begin PBXFileReference section */
18133C7E2C64E342005CEAAC /* ggml-aarch64.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-aarch64.h"; path = "../../../ggml/src/ggml-aarch64.h"; sourceTree = "<group>"; };
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
@ -73,7 +70,7 @@
18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
18ABE1572AF556340044A204 /* ggml-backend.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-backend.c"; path = "../../../ggml/src/ggml-backend.c"; sourceTree = "<group>"; };
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
@ -115,12 +112,10 @@
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup;
children = (
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
18133C7E2C64E342005CEAAC /* ggml-aarch64.h */,
18A275FF2C2A9563001C8D37 /* ggml-common.h */,
18A275FE2C2A94DE001C8D37 /* ggml-metal.h */,
18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
18ABE1572AF556340044A204 /* ggml-backend.cpp */,
18ABE1572AF556340044A204 /* ggml-backend.c */,
18ABE1552AF556340044A204 /* ggml-backend.h */,
18ABE1582AF556340044A204 /* ggml-impl.h */,
18ABE1592AF556340044A204 /* ggml-quants.c */,
@ -241,14 +236,13 @@
files = (
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */,
18133C802C64E342005CEAAC /* ggml-aarch64.c in Sources */,
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
18627C9629052C5800BD2A04 /* ggml.c in Sources */,
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
18ABE15A2AF556340044A204 /* ggml-backend.c in Sources */,
18627C8C29052BE000BD2A04 /* main.m in Sources */,
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,

View File

@ -50,24 +50,9 @@ else()
set(GGML_BLAS_VENDOR_DEFAULT "Generic")
endif()
if (CMAKE_CROSSCOMPILING)
set(GGML_NATIVE_DEFAULT OFF)
else()
set(GGML_NATIVE_DEFAULT ON)
endif()
# defaults
if (NOT GGML_LLAMAFILE_DEFAULT)
set(GGML_LLAMAFILE_DEFAULT OFF)
endif()
if (NOT GGML_CUDA_GRAPHS_DEFAULT)
set(GGML_CUDA_GRAPHS_DEFAULT OFF)
endif()
# general
option(GGML_STATIC "ggml: static link libraries" OFF)
option(GGML_NATIVE "ggml: enable -march=native flag" ${GGML_NATIVE_DEFAULT})
option(GGML_NATIVE "ggml: enable -march=native flag" ON)
option(GGML_LTO "ggml: enable link time optimization" OFF)
option(GGML_CCACHE "ggml: use ccache if available" ON)
@ -85,7 +70,7 @@ option(GGML_SANITIZE_ADDRESS "ggml: enable address sanitizer" OFF)
option(GGML_SANITIZE_UNDEFINED "ggml: enable undefined sanitizer" OFF)
# instruction set specific
if (GGML_NATIVE OR NOT GGML_NATIVE_DEFAULT)
if (GGML_NATIVE)
set(INS_ENB OFF)
else()
set(INS_ENB ON)
@ -119,13 +104,11 @@ option(GGML_ACCELERATE "ggml: enable Accelerate framework"
option(GGML_BLAS "ggml: use BLAS" ${GGML_BLAS_DEFAULT})
set(GGML_BLAS_VENDOR ${GGML_BLAS_VENDOR_DEFAULT} CACHE STRING
"ggml: BLAS library vendor")
option(GGML_LLAMAFILE "ggml: use LLAMAFILE" ${GGML_LLAMAFILE_DEFAULT})
option(GGML_LLAMAFILE "ggml: use ggml SGEMM" OFF)
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
@ -136,16 +119,14 @@ set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM" OFF)
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_CURL "ggml: use libcurl to download model from an URL" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug output" OFF)
option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF)
option(GGML_VULKAN_PERF "ggml: enable Vulkan perf output" OFF)
option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF)
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
@ -211,20 +192,13 @@ endif ()
include(GNUInstallDirs)
include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS
include/ggml.h
include/ggml-alloc.h
include/ggml-backend.h
include/ggml-blas.h
include/ggml-cann.h
include/ggml-cuda.h
include/ggml.h
include/ggml-kompute.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
include/ggml-vulkan.h)
"${GGML_HEADERS_CUDA}"
"${GGML_HEADERS_METAL}"
"${GGML_HEADERS_EXTRA}")
set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)

View File

@ -7,8 +7,8 @@ extern "C" {
#endif
typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
typedef struct ggml_backend * ggml_backend_t;
// Tensor allocator
struct ggml_tallocr {
@ -24,7 +24,7 @@ GGML_API void ggml_tallocr_alloc(struct ggml_tallocr * talloc, st
// Graph allocator
/*
Example usage:
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
ggml_gallocr_t galloc = ggml_gallocr_new(ggml_bacckend_cpu_buffer_type());
// optional: create a worst-case graph and reserve the buffers to avoid reallocations
ggml_gallocr_reserve(galloc, build_graph(max_batch));
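The example comment is truncated here; a hedged sketch of the remaining per-batch step, assuming the usual `ggml_gallocr_alloc_graph` call and a user-supplied `build_graph`:
// allocate and compute the graph for the current batch
struct ggml_cgraph * graph = build_graph(batch);
ggml_gallocr_alloc_graph(galloc, graph); // reuses the buffers reserved above when possible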

View File

@ -12,52 +12,41 @@ extern "C" {
typedef struct ggml_backend_event * ggml_backend_event_t;
typedef struct ggml_backend * ggml_backend_t;
typedef void * ggml_backend_graph_plan_t;
typedef struct ggml_backend_reg * ggml_backend_reg_t;
typedef struct ggml_backend_device * ggml_backend_dev_t;
//
// Backend buffer type
//
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API ggml_backend_dev_t ggml_backend_buft_get_device (ggml_backend_buffer_type_t buft);
//
// Backend buffer
//
// buffer type
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
enum ggml_backend_buffer_usage {
GGML_BACKEND_BUFFER_USAGE_ANY = 0,
GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1,
GGML_BACKEND_BUFFER_USAGE_COMPUTE = 2,
};
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API enum ggml_backend_buffer_usage ggml_backend_buffer_get_usage (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer);
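A minimal sketch tying the buffer-type and buffer calls above together on the CPU backend (the 16 MiB size is arbitrary):
ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
ggml_backend_buffer_t      buf  = ggml_backend_buft_alloc_buffer(buft, 16*1024*1024);
printf("%s: base=%p size=%zu host=%d\n",
        ggml_backend_buffer_name(buf),
        ggml_backend_buffer_get_base(buf),
        ggml_backend_buffer_get_size(buf),
        (int) ggml_backend_buffer_is_host(buf));
ggml_backend_buffer_free(buf);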
//
// Backend (stream)
// Backend
//
GGML_API ggml_guid_t ggml_backend_guid(ggml_backend_t backend);
@ -72,10 +61,8 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset of the tensor data for setting/getting data
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API GGML_CALL void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
@ -85,118 +72,64 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
// NOTE: will be removed, use device version instead
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
// asynchronous copy
// the copy is performed after all the currently queued operations in backend_src
// backend_dst will wait for the copy to complete before performing other operations
// automatic fallback to sync copy if async is not supported
GGML_API void ggml_backend_tensor_copy_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, struct ggml_tensor * src, struct ggml_tensor * dst);
GGML_API ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend);
// events
GGML_API ggml_backend_event_t ggml_backend_event_new (ggml_backend_t backend);
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
//
// Events
// CPU backend
//
GGML_API ggml_backend_event_t ggml_backend_event_new(ggml_backend_dev_t device);
GGML_API void ggml_backend_event_free(ggml_backend_event_t event);
GGML_API void ggml_backend_event_record(ggml_backend_event_t event, ggml_backend_t backend);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
GGML_API void ggml_backend_event_wait(ggml_backend_t backend, ggml_backend_event_t event);
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
//
// Backend device
//
GGML_API GGML_CALL bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
enum ggml_backend_dev_type {
GGML_BACKEND_DEVICE_TYPE_CPU,
GGML_BACKEND_DEVICE_TYPE_GPU,
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
};
// Create a backend buffer from an existing pointer
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
// functionality supported by the device
struct ggml_backend_dev_caps {
// asynchronous operations
bool async;
// pinned host buffer
bool host_buffer;
// event synchronization
bool events;
};
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
// all the device properties
struct ggml_backend_dev_props {
const char * name;
const char * description;
size_t memory_free;
size_t memory_total;
enum ggml_backend_dev_type type;
struct ggml_backend_dev_caps caps;
};
GGML_API const char * ggml_backend_dev_name(ggml_backend_dev_t device);
GGML_API const char * ggml_backend_dev_description(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_memory(ggml_backend_dev_t device, size_t * free, size_t * total);
GGML_API enum ggml_backend_dev_type ggml_backend_dev_type(ggml_backend_dev_t device);
GGML_API void ggml_backend_dev_get_props(ggml_backend_dev_t device, struct ggml_backend_dev_props * props);
GGML_API ggml_backend_reg_t ggml_backend_dev_backend_reg(ggml_backend_dev_t device);
GGML_API ggml_backend_t ggml_backend_dev_init(ggml_backend_dev_t device, const char * params);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_type_t ggml_backend_dev_host_buffer_type(ggml_backend_dev_t device);
GGML_API ggml_backend_buffer_t ggml_backend_dev_buffer_from_host_ptr(ggml_backend_dev_t device, void * ptr, size_t size, size_t max_tensor_size);
GGML_API bool ggml_backend_dev_supports_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
GGML_API bool ggml_backend_dev_supports_buft(ggml_backend_dev_t device, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_dev_offload_op(ggml_backend_dev_t device, const struct ggml_tensor * op);
//
// Backend (reg)
//
GGML_API const char * ggml_backend_reg_name(ggml_backend_reg_t reg);
GGML_API size_t ggml_backend_reg_dev_count(ggml_backend_reg_t reg);
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
// Functions that may be obtained using ggml_backend_reg_get_proc_address
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
//
// Backend registry
//
// Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
GGML_API ggml_backend_reg_t ggml_backend_reg_by_name(const char * name);
// The backend registry is a registry of all the available backends, and allows initializing backends in a generic way
// Device enumeration
GGML_API size_t ggml_backend_dev_count(void);
GGML_API ggml_backend_dev_t ggml_backend_dev_get(size_t index);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_name(const char * name);
GGML_API ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type);
// Direct backend (stream) initialization
// = ggml_backend_dev_init(ggml_backend_dev_by_name(name), params)
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
GGML_API ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size);
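For builds that expose the device/registry variant above, enumeration and initialization might look like this sketch:
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
    ggml_backend_dev_t dev = ggml_backend_dev_get(i);
    printf("device %zu: %s (%s)\n", i, ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
}
ggml_backend_t backend = ggml_backend_init_best(); // prefers a full GPU device, falls back to CPU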
//
// Backend scheduler
//
// The backend scheduler allows for multiple backend devices to be used together
// The backend scheduler allows for multiple backends to be used together
// Handles compute buffer allocation, assignment of tensors to backends, and copying of tensors between backends
// The backends are selected based on:
// - the backend that supports the operation
@ -231,9 +164,9 @@ extern "C" {
}
*/
struct ggml_backend_sched;
typedef struct ggml_backend_sched * ggml_backend_sched_t;
// Evaluation callback for each node in the graph (set with ggml_backend_sched_set_eval_callback)
// when ask == true, the scheduler wants to know if the user wants to observe this node
// this allows the scheduler to batch nodes together in order to evaluate them in a single call
//
@ -247,7 +180,7 @@ extern "C" {
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); // returns success
GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
GGML_API int ggml_backend_sched_get_n_backends(ggml_backend_sched_t sched);
GGML_API ggml_backend_t ggml_backend_sched_get_backend(ggml_backend_sched_t sched, int i);
@ -262,7 +195,7 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph); // returns success
GGML_API bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
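A rough sketch of the scheduler flow using only the calls declared above (`sched` is an already-created `ggml_backend_sched_t`; `measure_graph` and `graph` are user-built cgraphs):
ggml_backend_sched_reserve(sched, measure_graph); // size compute buffers once from a worst-case graph
ggml_backend_sched_graph_compute(sched, graph);   // assigns nodes to backends, allocates, and runs
ggml_backend_sched_synchronize(sched);
ggml_backend_sched_free(sched);                   // when done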
@ -288,7 +221,7 @@ extern "C" {
GGML_API struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph);
GGML_API void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy);
typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
typedef bool (*GGML_CALL ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data);
// Compare the output of two backends
GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data);
@ -297,26 +230,6 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
#ifdef __cplusplus
}

View File

@ -9,13 +9,13 @@ extern "C" {
#endif
// backend API
GGML_API ggml_backend_t ggml_backend_blas_init(void);
GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
#ifdef __cplusplus

View File

@ -1,121 +0,0 @@
/*
* Copyright (c) 2023-2024 The ggml authors
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to
* deal in the Software without restriction, including without limitation the
* rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#pragma once
#include "ggml-backend.h"
#include "ggml.h"
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Maximum number of CANN devices supported.
*/
#define GGML_CANN_MAX_DEVICES 16
/**
* @brief Initializes the CANN backend for a specified device.
*
* This function initializes the CANN backend for the given device.
* It verifies the device index, allocates a context, and creates a backend
* instance.
*
* @param device The index of the device to initialize.
* @return A pointer to the initialized backend instance, or nullptr on failure.
*/
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
/**
* @brief Checks if a given backend is a CANN backend.
*
* This function verifies if the provided backend is a CANN backend by comparing
* its GUID with the CANN backend's GUID.
*
* @param backend The backend instance to check.
* @return True if the backend is a CANN backend, false otherwise.
*/
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
/**
* @brief Retrieves the CANN buffer type for a specified device.
*
* This function initializes and returns the buffer type interface associated
* with the given device. It ensures thread-safe access using a mutex.
*
* @param device The device index for which to retrieve the buffer type.
* @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range.
*/
GGML_API ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device);
/**
* @brief Retrieves the number of CANN devices available.
*
* This function returns the number of CANN devices available based on
* information obtained from `ggml_cann_info()`.
*
* @return The number of CANN devices available.
*/
GGML_API int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
*
* @return A pointer to the host buffer type interface.
*/
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
/**
* @brief Retrieves the description of a specific CANN device.
*
* This function sets the specified device, retrieves the SoC name,
* and writes it into the provided description buffer.
*
* @param device The device index to retrieve the description for.
* @param description Pointer to a buffer where the description will be written.
* @param description_size Size of the description buffer.
*/
GGML_API void ggml_backend_cann_get_device_description(
int32_t device, char* description, size_t description_size);
/**
* @brief Retrieves the memory information of a specific CANN device.
*
* This function sets the specified device, retrieves the free and total
* memory information of the specified type (ACL_HBM_MEM), and stores them
* in the provided pointers.
*
* @param device The device index to retrieve memory information for.
* @param free Pointer to a variable where the free memory size will be stored.
* @param total Pointer to a variable where the total memory size will be
* stored.
*/
GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
size_t* free,
size_t* total);
#ifdef __cplusplus
}
#endif

View File

@ -3,45 +3,42 @@
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
#define GGML_CUDA_NAME "MUSA"
#define GGML_CUBLAS_NAME "muBLAS"
#else
#define GGML_CUDA_NAME "CUDA"
#define GGML_CUBLAS_NAME "cuBLAS"
#endif
#ifdef __cplusplus
extern "C" {
#endif
#define GGML_CUDA_MAX_DEVICES 16
// backend API
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API int ggml_backend_cuda_get_device_count(void);
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API void ggml_backend_cuda_log_set_callback(ggml_log_callback log_callback, void * user_data);
#ifdef __cplusplus
}
#endif

View File

@ -1,5 +1,3 @@
// Note: this description is outdated
//
// An interface allowing to compute ggml_cgraph with Metal
//
// This is a fully functional interface that extends ggml with GPU support for Apple devices.
@ -27,6 +25,9 @@
#include <stddef.h>
#include <stdbool.h>
// max memory buffers that can be mapped to the device
#define GGML_METAL_MAX_BUFFERS 64
struct ggml_tensor;
struct ggml_cgraph;
@ -39,15 +40,17 @@ extern "C" {
// user-code should use only these functions
//
GGML_API void ggml_backend_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
@ -60,3 +63,4 @@ GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
#ifdef __cplusplus
}
#endif
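A hedged usage sketch for the Metal entry points above (Apple platforms, GGML_METAL build assumed; ggml_backend_free() comes from ggml-backend.h, not this header):

ggml_backend_t backend = ggml_backend_metal_init();
if (backend != NULL && ggml_backend_is_metal(backend)) {
    ggml_backend_metal_set_n_cb(backend, 1); // one command buffer is enough for small graphs
    // ... allocate buffers via ggml_backend_metal_buffer_type(), run graphs ...
    ggml_backend_free(backend);              // declared in ggml-backend.h
}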

View File

@ -10,14 +10,14 @@ extern "C" {
#define GGML_RPC_MAX_SERVERS 16
// backend API
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API GGML_CALL ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API GGML_CALL bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API GGML_CALL void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
GGML_API GGML_CALL void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
#ifdef __cplusplus
}
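A client-side sketch for the RPC API above; the "host:port" endpoint format and the address used here are assumptions for illustration:

const char * endpoint = "192.168.1.10:50052"; // hypothetical worker address
size_t free_mem = 0, total_mem = 0;
ggml_backend_rpc_get_device_memory(endpoint, &free_mem, &total_mem);

ggml_backend_t rpc = ggml_backend_rpc_init(endpoint);
if (rpc != NULL && ggml_backend_is_rpc(rpc)) {
    // ... use `rpc` like any other ggml backend; the remote worker runs start_rpc_server() ...
}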

View File

@ -23,20 +23,20 @@ GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API int ggml_backend_sycl_get_device_count();
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API GGML_CALL int ggml_backend_sycl_get_device_count();
GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
// SYCL doesn't support registering host memory, keep here for reference
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
// GGML_API GGML_CALL bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API GGML_CALL void ggml_backend_sycl_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif

View File

@ -13,16 +13,16 @@ extern "C" {
GGML_API void ggml_vk_instance_init(void);
// backend API
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API GGML_CALL ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API int ggml_backend_vk_get_device_count(void);
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API GGML_CALL bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API GGML_CALL int ggml_backend_vk_get_device_count(void);
GGML_API GGML_CALL void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API GGML_CALL void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus
}

View File

@ -187,6 +187,16 @@
# define GGML_API
#endif
#ifdef GGML_MULTIPLATFORM
# if defined(_WIN32)
# define GGML_CALL
# else
# define GGML_CALL __attribute__((__ms_abi__))
# endif
#else
# define GGML_CALL
#endif
// TODO: support for clang
#ifdef __GNUC__
# define GGML_DEPRECATED(func, hint) func __attribute__((deprecated(hint)))
@ -210,24 +220,21 @@
#include <stdio.h>
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 2
#define GGML_FILE_VERSION 1
#define GGML_QNT_VERSION 2 // bump this on quantization format changes
#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
#define GGML_MAX_DIMS 4
#define GGML_MAX_PARAMS 2048
#define GGML_MAX_CONTEXTS 64
#define GGML_MAX_SRC 10
#define GGML_MAX_N_THREADS 512
#define GGML_MAX_OP_PARAMS 64
#ifndef GGML_MAX_NAME
# define GGML_MAX_NAME 64
#define GGML_MAX_NAME 64
#endif
#define GGML_MAX_OP_PARAMS 64
#define GGML_DEFAULT_N_THREADS 4
#define GGML_DEFAULT_GRAPH_SIZE 2048
#if UINTPTR_MAX == 0xFFFFFFFF
#define GGML_MEM_ALIGN 4
#else
@ -237,8 +244,6 @@
#define GGML_EXIT_SUCCESS 0
#define GGML_EXIT_ABORTED 1
#define GGML_ROPE_TYPE_NEOX 2
#define GGUF_MAGIC "GGUF"
#define GGUF_VERSION 3
@ -249,27 +254,26 @@
#define GGML_PAD(x, n) (((x) + (n) - 1) & ~((n) - 1))
#define GGML_ASSERT(x) \
do { \
if (!(x)) { \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
ggml_print_backtrace(); \
abort(); \
} \
} while (0)
#ifndef NDEBUG
# define GGML_UNREACHABLE() do { fprintf(stderr, "statement should be unreachable\n"); abort(); } while(0)
#define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
#elif defined(__GNUC__)
# define GGML_UNREACHABLE() __builtin_unreachable()
#define GGML_UNREACHABLE() __builtin_unreachable()
#elif defined(_MSC_VER)
# define GGML_UNREACHABLE() __assume(0)
#define GGML_UNREACHABLE() __assume(0)
#else
# define GGML_UNREACHABLE() ((void) 0)
#define GGML_UNREACHABLE() ((void) 0)
#endif
#ifdef __cplusplus
# define GGML_NORETURN [[noreturn]]
#elif defined(_MSC_VER)
# define GGML_NORETURN __declspec(noreturn)
#else
# define GGML_NORETURN _Noreturn
#endif
#define GGML_ABORT(...) ggml_abort(__FILE__, __LINE__, __VA_ARGS__)
#define GGML_ASSERT(x) if (!(x)) GGML_ABORT("GGML_ASSERT(%s) failed", #x)
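Illustrative use of the reworked macros (one side of this diff routes GGML_ASSERT through GGML_ABORT, which records file/line and accepts printf-style arguments); check_matrix is a made-up helper, not part of the header:

static void check_matrix(const struct ggml_tensor * t) {
    GGML_ASSERT(t != NULL);
    if (!ggml_is_matrix(t)) {
        GGML_ABORT("expected a 2D tensor, got %d dims", ggml_n_dims(t));
    }
}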
// used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability.
//
@ -318,9 +322,6 @@
extern "C" {
#endif
GGML_NORETURN GGML_ATTRIBUTE_FORMAT(3, 4)
GGML_API void ggml_abort(const char * file, int line, const char * fmt, ...);
enum ggml_status {
GGML_STATUS_ALLOC_FAILED = -2,
GGML_STATUS_FAILED = -1,
@ -329,7 +330,7 @@ extern "C" {
};
// get ggml_status name string
GGML_API const char * ggml_status_to_string(enum ggml_status status);
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
// ieee 754-2008 half-precision float16
// todo: make this not an integral type
@ -344,12 +345,10 @@ extern "C" {
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
GGML_API void ggml_fp32_to_bf16_row_ref(const float *, ggml_bf16_t *, int64_t);
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
struct ggml_object;
struct ggml_context;
struct ggml_cgraph;
// NOTE: always add types at the end of the enum to keep backward compatibility
enum ggml_type {
@ -384,11 +383,6 @@ extern "C" {
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_Q4_0_4_4 = 31,
GGML_TYPE_Q4_0_4_8 = 32,
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_COUNT,
};
@ -430,9 +424,6 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
};
// available tensor operations:
@ -449,13 +440,10 @@ extern "C" {
GGML_OP_SQR,
GGML_OP_SQRT,
GGML_OP_LOG,
GGML_OP_SIN,
GGML_OP_COS,
GGML_OP_SUM,
GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
GGML_OP_ARGMAX,
GGML_OP_COUNT_EQUAL,
GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK,
GGML_OP_CONCAT,
@ -489,11 +477,9 @@ extern "C" {
GGML_OP_CLAMP,
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
GGML_OP_IM2COL_BACK,
GGML_OP_CONV_TRANSPOSE_2D,
GGML_OP_POOL_1D,
GGML_OP_POOL_2D,
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_ARANGE,
@ -509,7 +495,6 @@ extern "C" {
GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS,
GGML_OP_RWKV_WKV,
GGML_OP_UNARY,
@ -526,7 +511,6 @@ extern "C" {
GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_OPT_STEP_ADAMW,
GGML_OP_COUNT,
};
@ -545,7 +529,6 @@ extern "C" {
GGML_UNARY_OP_SILU,
GGML_UNARY_OP_HARDSWISH,
GGML_UNARY_OP_HARDSIGMOID,
GGML_UNARY_OP_EXP,
GGML_UNARY_OP_COUNT,
};
@ -557,25 +540,35 @@ extern "C" {
};
enum ggml_log_level {
GGML_LOG_LEVEL_NONE = 0,
GGML_LOG_LEVEL_INFO = 1,
GGML_LOG_LEVEL_WARN = 2,
GGML_LOG_LEVEL_ERROR = 3,
GGML_LOG_LEVEL_DEBUG = 4,
GGML_LOG_LEVEL_CONT = 5, // continue previous log
GGML_LOG_LEVEL_ERROR = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_INFO = 4,
GGML_LOG_LEVEL_DEBUG = 5
};
// this tensor...
enum ggml_tensor_flag {
GGML_TENSOR_FLAG_INPUT = 1, // ...is an input for the GGML compute graph
GGML_TENSOR_FLAG_OUTPUT = 2, // ...is an output for the GGML compute graph
GGML_TENSOR_FLAG_PARAM = 4, // ...contains trainable parameters
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
GGML_TENSOR_FLAG_INPUT = 1,
GGML_TENSOR_FLAG_OUTPUT = 2,
GGML_TENSOR_FLAG_PARAM = 4,
};
// ggml object
struct ggml_object {
size_t offs;
size_t size;
struct ggml_object * next;
enum ggml_object_type type;
char padding[4];
};
static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
enum ggml_type type;
GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
@ -618,29 +611,6 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// Scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// Threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
@ -648,15 +618,39 @@ extern "C" {
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
enum ggml_cgraph_eval_order {
GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
GGML_CGRAPH_EVAL_ORDER_COUNT
};
struct ggml_hash_set {
size_t size;
struct ggml_tensor ** keys;
};
// computation graph
struct ggml_cgraph {
int size;
int n_nodes;
int n_leafs;
struct ggml_tensor ** nodes;
struct ggml_tensor ** grads;
struct ggml_tensor ** leafs;
struct ggml_hash_set visited_hash_table;
enum ggml_cgraph_eval_order order;
};
// scratch buffer
// TODO: deprecate and remove
struct ggml_scratch {
size_t offs;
size_t size;
@ -698,6 +692,8 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);
GGML_API void ggml_print_backtrace(void);
// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
@ -707,52 +703,50 @@ extern "C" {
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
GGML_API int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad(const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API GGML_CALL int64_t ggml_nelements (const struct ggml_tensor * tensor);
GGML_API GGML_CALL int64_t ggml_nrows (const struct ggml_tensor * tensor);
GGML_API GGML_CALL size_t ggml_nbytes (const struct ggml_tensor * tensor);
GGML_API size_t ggml_nbytes_pad (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
GGML_API int64_t ggml_blck_size(enum ggml_type type);
GGML_API size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_API GGML_CALL int ggml_blck_size(enum ggml_type type);
GGML_API GGML_CALL size_t ggml_type_size(enum ggml_type type); // size in bytes for all elements in a block
GGML_API GGML_CALL size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
GGML_DEPRECATED(
GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
"use ggml_row_size() instead");
GGML_API const char * ggml_type_name(enum ggml_type type);
GGML_API const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API GGML_CALL const char * ggml_type_name(enum ggml_type type);
GGML_API GGML_CALL const char * ggml_op_name (enum ggml_op op);
GGML_API const char * ggml_op_symbol(enum ggml_op op);
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
GGML_API const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
GGML_API const char * ggml_unary_op_name(enum ggml_unary_op op);
GGML_API GGML_CALL const char * ggml_op_desc(const struct ggml_tensor * t); // unary or op name
GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
GGML_API GGML_CALL size_t ggml_element_size(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_quantized(enum ggml_type type);
GGML_API GGML_CALL bool ggml_is_quantized(enum ggml_type type);
// TODO: temporary until model loading of ggml examples is refactored
GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
GGML_API GGML_CALL bool ggml_is_transposed(const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_permuted (const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_empty (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_scalar (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_vector (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_matrix (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_3d (const struct ggml_tensor * tensor);
GGML_API int ggml_n_dims (const struct ggml_tensor * tensor); // returns 1 for scalars
GGML_API bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
GGML_API GGML_CALL bool ggml_is_contiguous (const struct ggml_tensor * tensor);
GGML_API GGML_CALL bool ggml_is_contiguous_0(const struct ggml_tensor * tensor); // same as ggml_is_contiguous()
GGML_API GGML_CALL bool ggml_is_contiguous_1(const struct ggml_tensor * tensor); // contiguous for dims >= 1
GGML_API GGML_CALL bool ggml_is_contiguous_2(const struct ggml_tensor * tensor); // contiguous for dims >= 2
GGML_API bool ggml_are_same_shape (const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_are_same_stride(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
GGML_API bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1);
// use this to compute the memory overhead of a tensor
GGML_API size_t ggml_tensor_overhead(void);
@ -760,9 +754,8 @@ extern "C" {
// main
GGML_API struct ggml_context * ggml_init (struct ggml_init_params params);
GGML_API void ggml_reset(struct ggml_context * ctx);
GGML_API void ggml_free (struct ggml_context * ctx);
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
@ -839,7 +832,7 @@ extern "C" {
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API GGML_CALL enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
@ -960,22 +953,6 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_sin_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_cos_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// return scalar
GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx,
@ -996,12 +973,6 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// count number of equal elements in a and b
GGML_API struct ggml_tensor * ggml_count_equal(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// if a is the same shape as b, and a is not parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b
GGML_API struct ggml_tensor * ggml_repeat(
@ -1132,14 +1103,6 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_exp(
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_exp_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a);
// normalize along rows
GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx,
@ -1163,17 +1126,16 @@ extern "C" {
// group normalize along ne0*ne1*n_groups
// used in stable-diffusion
// TODO: eps is hardcoded to 1e-6 for now
GGML_API struct ggml_tensor * ggml_group_norm(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups,
float eps);
int n_groups);
GGML_API struct ggml_tensor * ggml_group_norm_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_groups,
float eps);
int n_groups);
// a - x
// b - dy
@ -1235,7 +1197,7 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset); // in bytes
size_t offset);
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_inplace(
@ -1245,19 +1207,19 @@ extern "C" {
size_t nb1,
size_t nb2,
size_t nb3,
size_t offset); // in bytes
size_t offset);
GGML_API struct ggml_tensor * ggml_set_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset); // in bytes
size_t offset);
GGML_API struct ggml_tensor * ggml_set_1d_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t offset); // in bytes
size_t offset);
// b -> view(a,offset,nb1,nb2,3), return modified a
GGML_API struct ggml_tensor * ggml_set_2d(
@ -1265,7 +1227,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset); // in bytes
size_t offset);
// b -> view(a,offset,nb1,nb2,3), return view(a)
GGML_API struct ggml_tensor * ggml_set_2d_inplace(
@ -1273,7 +1235,7 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b,
size_t nb1,
size_t offset); // in bytes
size_t offset);
// a -> b, return view(b)
GGML_API struct ggml_tensor * ggml_cpy(
@ -1408,14 +1370,14 @@ extern "C" {
// supports 3D: a->ne[2] == b->ne[1]
GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx,
struct ggml_tensor * a, // data
struct ggml_tensor * b); // row indices
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_get_rows_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // gradients of ggml_get_rows result
struct ggml_tensor * b, // row indices
struct ggml_tensor * c); // data for ggml_get_rows, only used for its shape
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c);
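A hypothetical sketch for the row-gather operator documented above; `ctx` is an existing ggml_context and the shapes are arbitrary:

struct ggml_tensor * data = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 8); // 8 rows of 64 floats
struct ggml_tensor * rows = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);     // will hold indices {0, 2}
struct ggml_tensor * out  = ggml_get_rows(ctx, data, rows);                // result: 2 rows of 64 floats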
GGML_API struct ggml_tensor * ggml_diag(
struct ggml_context * ctx,
@ -1476,10 +1438,11 @@ extern "C" {
struct ggml_tensor * b);
// rotary position embedding
// if (mode & 1) - skip n_past elements (NOT SUPPORTED)
// if (mode & GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
// if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
// if mode & 2 == 1, GPT-NeoX style
//
// b is an int32 vector with size a->ne[2], it contains the positions
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1496,7 +1459,6 @@ extern "C" {
int mode);
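A hedged call sketch matching the comment above; `ctx`, `cur`, `n_tokens` and `n_rot` are assumed to exist, and GGML_ROPE_TYPE_NEOX is the mode bit defined earlier in this header:

struct ggml_tensor * pos = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens); // positions, size == cur->ne[2]
// ... fill pos with the absolute token positions ...
struct ggml_tensor * cur_rot = ggml_rope(ctx, cur, pos, n_rot, GGML_ROPE_TYPE_NEOX);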
// custom RoPE
// c is freq factors (e.g. phi3-128k), (optional)
GGML_API struct ggml_tensor * ggml_rope_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
@ -1559,16 +1521,16 @@ extern "C" {
"use ggml_rope_ext_inplace instead");
// compute correction dims for YaRN RoPE scaling
void ggml_rope_yarn_corr_dims(
GGML_CALL void ggml_rope_yarn_corr_dims(
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
// rotary position embedding backward, i.e compute dx from dy
// a - dy
GGML_API struct ggml_tensor * ggml_rope_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // gradients of ggml_rope result
struct ggml_tensor * b, // positions
struct ggml_tensor * c, // freq factors
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c,
int n_dims,
int mode,
int n_ctx_orig,
@ -1587,49 +1549,34 @@ extern "C" {
float min,
float max);
// im2col
// converts data into a format that effectively results in a convolution when combined with matrix multiplication
GGML_API struct ggml_tensor * ggml_im2col(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1, // dilation dimension 1
bool is_2D,
enum ggml_type dst_type);
GGML_API struct ggml_tensor * ggml_im2col_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // gradient of im2col output
int64_t * ne, // shape of im2col input
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1, // dilation dimension 1
bool is_2D);
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1,
bool is_2D,
enum ggml_type dst_type);
GGML_API struct ggml_tensor * ggml_conv_depthwise_2d(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
GGML_API struct ggml_tensor * ggml_conv_1d(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0, // stride
int p0, // padding
int d0); // dilation
@ -1638,29 +1585,29 @@ extern "C" {
// alias for ggml_conv_1d(a, b, s, a->ne[0]/2, d)
GGML_API struct ggml_tensor* ggml_conv_1d_ph(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s, // stride
int d); // dilation
struct ggml_tensor * a,
struct ggml_tensor * b,
int s,
int d);
GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int p0, // padding
int d0); // dilation
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int p0,
int d0);
GGML_API struct ggml_tensor * ggml_conv_2d(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride dimension 0
int s1, // stride dimension 1
int p0, // padding dimension 0
int p1, // padding dimension 1
int d0, // dilation dimension 0
int d1); // dilation dimension 1
struct ggml_tensor * a,
struct ggml_tensor * b,
int s0,
int s1,
int p0,
int p1,
int d0,
int d1);
// kernel size is a->ne[0] x a->ne[1]
@ -1722,18 +1669,6 @@ extern "C" {
float p0,
float p1);
GGML_API struct ggml_tensor * ggml_pool_2d_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * af, // "a"/input used in forward pass
enum ggml_op_pool op,
int k0,
int k1,
int s0,
int s1,
float p0,
float p1);
// nearest interpolate
// multiplies ne0 and ne1 by scale factor
// used in stable-diffusion
@ -1808,8 +1743,7 @@ extern "C" {
struct ggml_tensor * v,
struct ggml_tensor * mask,
float scale,
float max_bias,
float logit_softcap);
float max_bias);
GGML_API void ggml_flash_attn_ext_set_prec(
struct ggml_tensor * a,
@ -1826,8 +1760,10 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_ssm_conv(
struct ggml_context * ctx,
struct ggml_tensor * sx,
struct ggml_tensor * c);
struct ggml_tensor * s,
struct ggml_tensor * x,
struct ggml_tensor * c,
struct ggml_tensor * sq);
GGML_API struct ggml_tensor * ggml_ssm_scan(
struct ggml_context * ctx,
@ -1836,7 +1772,8 @@ extern "C" {
struct ggml_tensor * dt,
struct ggml_tensor * A,
struct ggml_tensor * B,
struct ggml_tensor * C);
struct ggml_tensor * C,
struct ggml_tensor * sq);
// partition into non-overlapping windows with padding if needed
// example:
@ -1888,15 +1825,6 @@ extern "C" {
struct ggml_tensor * pw,
struct ggml_tensor * ph);
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * r,
struct ggml_tensor * tf,
struct ggml_tensor * td,
struct ggml_tensor * state);
// custom operators
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
@ -1980,8 +1908,7 @@ extern "C" {
typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
#define GGML_N_TASKS_MAX (-1)
// n_tasks == GGML_N_TASKS_MAX means to use max number of tasks
#define GGML_N_TASKS_MAX -1
GGML_API struct ggml_tensor * ggml_map_custom1(
struct ggml_context * ctx,
@ -2034,84 +1961,44 @@ extern "C" {
// loss function
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
struct ggml_context * ctx,
struct ggml_tensor * a, // logits
struct ggml_tensor * b); // labels
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_context * ctx,
struct ggml_tensor * a, // logits
struct ggml_tensor * b, // labels
struct ggml_tensor * c); // gradients of cross_entropy_loss result
// AdamW optimizer step
// Paper: https://arxiv.org/pdf/1711.05101v3.pdf
// PyTorch: https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html
GGML_API struct ggml_tensor * ggml_opt_step_adamw(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * grad,
float alpha,
float beta1,
float beta2,
float eps,
float wd); // weight decay
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c);
//
// automatic differentiation
//
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(
struct ggml_context * ctx,
struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
GGML_API void ggml_build_opt_adamw(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
float alpha,
float beta1,
float beta2,
float eps,
float wd); // weight decay
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // set regular grads + optimizer momenta to 0, set loss grad to 1
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
GGML_API int ggml_graph_size (struct ggml_cgraph * cgraph);
GGML_API struct ggml_tensor * ggml_graph_node (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
GGML_API struct ggml_tensor ** ggml_graph_nodes (struct ggml_cgraph * cgraph);
GGML_API int ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
GGML_API struct ggml_cgraph * ggml_new_graph_custom (struct ggml_context * ctx, size_t size, bool grads);
GGML_API struct ggml_cgraph * ggml_graph_dup (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API struct ggml_cgraph ggml_graph_view (struct ggml_cgraph * cgraph, int i0, int i1);
GGML_API void ggml_graph_cpy (struct ggml_cgraph * src, struct ggml_cgraph * dst);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); // zero grads
GGML_API void ggml_graph_clear (struct ggml_cgraph * cgraph);
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/);
GGML_API enum ggml_status ggml_graph_compute ( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
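A sketch of the threadpool-aware compute path on one side of this diff, using only the declarations above; `gf` is assumed to be an already-built graph:

#include <stdlib.h>
#include "ggml.h"

static enum ggml_status compute_with_pool(struct ggml_cgraph * gf, int n_threads) {
    struct ggml_threadpool_params tpp = ggml_threadpool_params_default(n_threads);
    struct ggml_threadpool      * tp  = ggml_threadpool_new(&tpp);

    struct ggml_cplan plan = ggml_graph_plan(gf, n_threads, tp);
    if (plan.work_size > 0) {
        plan.work_data = malloc(plan.work_size); // the caller allocates the work buffer
    }

    enum ggml_status status = ggml_graph_compute(gf, &plan);

    free(plan.work_data);
    ggml_threadpool_free(tp);
    return status;
}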
@ -2175,10 +2062,6 @@ extern "C" {
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
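An illustrative log filter for the callback API above: suppress debug messages and pass everything else to stderr, the default sink when no callback is set:

#include <stdio.h>

static void quiet_log_cb(enum ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == GGML_LOG_LEVEL_DEBUG) {
        return;
    }
    fputs(text, stderr);
}

// during initialization:
//     ggml_log_set(quiet_log_cb, NULL);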
// optimization parameters
//
// see ggml.c (ggml_opt_default_params) for default values
@ -2504,16 +2387,10 @@ extern "C" {
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_riscv_v (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);
// get the sve vector length in bytes
GGML_API int ggml_cpu_get_sve_cnt(void);
//
// Internal types and functions exposed for tests and benchmarks
@ -2527,31 +2404,20 @@ extern "C" {
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef struct {
const char * type_name;
int64_t blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_ref;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
const char * type_name;
int blck_size;
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_reference;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously;
} ggml_type_traits_t;
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
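A hypothetical round trip using the traits table above (intended for tests/benchmarks); the caller sizes `quantized` with ggml_row_size(type, k), and k must be a multiple of the block size:

static void roundtrip(enum ggml_type type, const float * src, void * quantized, float * dst, int64_t k) {
    ggml_type_traits_t traits = ggml_internal_get_type_traits(type);
    traits.from_float(src, quantized, k); // quantize k float values into `quantized`
    traits.to_float  (quantized, dst, k); // dequantize them back into `dst`
}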

View File

@ -26,9 +26,6 @@ if (NOT MSVC)
endif()
endif()
unset(GGML_EXTRA_LIBS_PRIVATE)
unset(GGML_EXTRA_LIBS_PUBLIC)
if (APPLE AND GGML_ACCELERATE)
find_library(ACCELERATE_FRAMEWORK Accelerate)
if (ACCELERATE_FRAMEWORK)
@ -38,7 +35,7 @@ if (APPLE AND GGML_ACCELERATE)
add_compile_definitions(ACCELERATE_NEW_LAPACK)
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${ACCELERATE_FRAMEWORK})
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${ACCELERATE_FRAMEWORK})
else()
message(WARNING "Accelerate framework not found")
endif()
@ -90,7 +87,7 @@ if (GGML_METAL)
COMMENT "Generate assembly for embedded Metal library"
)
list(APPEND GGML_SOURCES_METAL ${METALLIB_EMBED_ASM})
set(GGML_SOURCES_METAL ${GGML_SOURCES_METAL} ${METALLIB_EMBED_ASM})
else()
if (GGML_METAL_SHADER_DEBUG)
# custom command to do the following:
@ -135,24 +132,13 @@ if (GGML_METAL)
)
endif() # GGML_METAL_EMBED_LIBRARY
list(APPEND GGML_EXTRA_LIBS_PRIVATE
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS}
${FOUNDATION_LIBRARY}
${METAL_FRAMEWORK}
${METALKIT_FRAMEWORK}
)
endif()
if (GGML_MUSA)
set(CMAKE_C_COMPILER clang)
set(CMAKE_C_EXTENSIONS OFF)
set(CMAKE_CXX_COMPILER clang++)
set(CMAKE_CXX_EXTENSIONS OFF)
set(GGML_CUDA ON)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_MUSA)
endif()
if (GGML_OPENMP)
find_package(OpenMP)
if (OpenMP_FOUND)
@ -160,12 +146,7 @@ if (GGML_OPENMP)
add_compile_definitions(GGML_USE_OPENMP)
list(APPEND GGML_EXTRA_LIBS_PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
if (GGML_MUSA)
list(APPEND GGML_EXTRA_INCLUDES "/usr/lib/llvm-10/include/openmp")
list(APPEND GGML_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-10/lib/libomp.so")
endif()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
else()
message(WARNING "OpenMP not found")
endif()
@ -247,8 +228,8 @@ if (GGML_BLAS)
set(GGML_HEADERS_BLAS ../include/ggml-blas.h)
set(GGML_SOURCES_BLAS ggml-blas.cpp)
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${BLAS_LIBRARIES})
list(APPEND GGML_EXTRA_INCLUDES ${BLAS_INCLUDE_DIRS})
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ${BLAS_LIBRARIES})
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${BLAS_INCLUDE_DIRS})
else()
message(WARNING "BLAS not found, please refer to "
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
@ -257,24 +238,18 @@ if (GGML_BLAS)
endif()
if (GGML_LLAMAFILE)
message(STATUS "Using llamafile")
message(STATUS "Using ggml SGEMM")
add_compile_definitions(GGML_USE_LLAMAFILE)
set(GGML_HEADERS_LLAMAFILE llamafile/sgemm.h)
set(GGML_SOURCES_LLAMAFILE llamafile/sgemm.cpp)
set(GGML_HEADERS_LLAMAFILE sgemm.h)
set(GGML_SOURCES_LLAMAFILE sgemm.cpp)
endif()
if (GGML_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
if (GGML_MUSA)
list(APPEND CMAKE_MODULE_PATH "/usr/local/musa/cmake/")
find_package(MUSAToolkit)
set(CUDAToolkit_FOUND ${MUSAToolkit_FOUND})
else()
find_package(CUDAToolkit)
endif()
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "CUDA found")
@ -293,11 +268,7 @@ if (GGML_CUDA)
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
if (GGML_MUSA)
set(CMAKE_CUDA_COMPILER ${MUSAToolkit_MCC_EXECUTABLE})
else()
enable_language(CUDA)
endif()
enable_language(CUDA)
file(GLOB GGML_HEADERS_CUDA "ggml-cuda/*.cuh")
list(APPEND GGML_HEADERS_CUDA "../include/ggml-cuda.h")
@ -324,15 +295,21 @@ if (GGML_CUDA)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CUDA)
# TODO: for now CUDA graphs should be used only with llama.cpp
# https://github.com/ggerganov/whisper.cpp/issues/2258
message(STATUS "CMAKE_PROJECT_NAME: ${CMAKE_PROJECT_NAME}")
if (CMAKE_PROJECT_NAME STREQUAL "llama.cpp")
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
message(STATUS "GGML_CUDA_USE_GRAPHS enabled")
else()
message(STATUS "GGML_CUDA_USE_GRAPHS disabled")
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
add_compile_definitions(K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
if (GGML_CUDA_GRAPHS)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
endif()
if (GGML_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
@ -361,40 +338,21 @@ if (GGML_CUDA)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (GGML_MUSA)
set_source_files_properties(${GGML_SOURCES_CUDA} PROPERTIES LANGUAGE CXX)
foreach(SOURCE ${GGML_SOURCES_CUDA})
set_property(SOURCE ${SOURCE} PROPERTY COMPILE_FLAGS "-x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22")
endforeach()
endif()
if (GGML_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
if (GGML_MUSA)
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart_static MUSA::mublas_static)
else()
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
endif()
else()
if (GGML_MUSA)
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musart MUSA::mublas)
else()
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
if (GGML_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
if (GGML_MUSA)
list(APPEND GGML_EXTRA_LIBS_PRIVATE MUSA::musa_driver) # required by muDeviceGetAttribute(), muMemGetAllocationGranularity(...), ...
else()
list(APPEND GGML_EXTRA_LIBS_PRIVATE CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
else()
message(WARNING "CUDA not found")
@ -488,17 +446,13 @@ if (GGML_HIPBLAS)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (GGML_CUDA_FORCE_CUBLAS)
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
endif()
if (GGML_CUDA_NO_PEER_COPY)
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
endif()
if (CXX_IS_HIPCC)
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
list(APPEND GGML_EXTRA_LIBS_PRIVATE hip::device)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} hip::device)
else()
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
endif()
@ -507,34 +461,27 @@ if (GGML_HIPBLAS)
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
endif()
list(APPEND GGML_EXTRA_LIBS_PUBLIC hip::host roc::rocblas roc::hipblas)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} PUBLIC hip::host roc::rocblas roc::hipblas)
endif()
if (GGML_SYCL)
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA|AMD)$")
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL, NVIDIA, or AMD")
if (NOT GGML_SYCL_TARGET MATCHES "^(INTEL|NVIDIA)$")
message(FATAL_ERROR "Invalid backend chosen, supported options are INTEL or NVIDIA")
endif()
check_cxx_compiler_flag("-fsycl" SUPPORTS_SYCL)
if (DEFINED ENV{ONEAPI_ROOT})
message(STATUS "Using oneAPI Release SYCL compiler (icpx).")
elseif(SUPPORTS_SYCL)
message(WARNING "Using open-source SYCL compiler (clang++). Didn't detect ENV {ONEAPI_ROOT}.
If you expected the oneAPI Release compiler, please install oneAPI & source it, like:
source /opt/intel/oneapi/setvars.sh")
else()
message(FATAL_ERROR, "C++ compiler lacks SYCL support.")
if ( NOT DEFINED ENV{ONEAPI_ROOT})
message(FATAL_ERROR "Not detect ENV {ONEAPI_ROOT}, please install oneAPI & source it, like: source /opt/intel/oneapi/setvars.sh")
endif()
message(STATUS "SYCL found")
#todo: AOT
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
message(STATUS "SYCL found")
list(APPEND GGML_CDEF_PUBLIC GGML_USE_SYCL)
if (GGML_SYCL_F16)
if (GGML_SYCL_TARGET STREQUAL "AMD")
message(WARNING "AMD target does not entirely support FP16 in the SYCL backend.")
endif()
add_compile_definitions(GGML_SYCL_F16)
endif()
@ -542,18 +489,12 @@ if (GGML_SYCL)
add_compile_definitions(GGML_SYCL_FORCE_MMQ)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing -fsycl")
add_compile_options(-I./) #include DPCT
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-narrowing")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O3")
if (GGML_SYCL_TARGET STREQUAL "NVIDIA")
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
# INFO: Allowed Sub_group_sizes are not consistent through all
# hip targets. For example, 64 is used for certain models, but the backend
# does not support it.
# Target archs tested working: gfx1030, gfx1031, (Only tested sub_group_size = 32)
add_compile_definitions(GGML_SYCL_WARP_SIZE=32)
else()
add_compile_definitions(GGML_SYCL_WARP_SIZE=16)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
endif()
file(GLOB GGML_HEADERS_SYCL "ggml-sycl/*.hpp")
@ -562,35 +503,16 @@ if (GGML_SYCL)
file(GLOB GGML_SOURCES_SYCL "ggml-sycl/*.cpp")
list(APPEND GGML_SOURCES_SYCL "ggml-sycl.cpp")
find_package(DNNL)
message("-- DNNL found:" ${DNNL_FOUND})
if (GGML_SYCL_TARGET STREQUAL "INTEL")
add_compile_definitions(GGML_SYCL_DNNL=${DNNL_FOUND})
else()
add_compile_definitions(GGML_SYCL_DNNL=0)
endif()
if (${DNNL_FOUND} AND GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND GGML_EXTRA_LIBS_PRIVATE DNNL::dnnl)
endif()
if (WIN32)
find_package(IntelSYCL REQUIRED)
find_package(MKL REQUIRED)
list(APPEND GGML_EXTRA_LIBS_PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
else()
add_compile_options(-I/${SYCL_INCLUDE_DIR})
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -L${MKLROOT}/lib")
if (GGML_SYCL_TARGET STREQUAL "INTEL")
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=nvptx64-nvidia-cuda")
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
elseif (GGML_SYCL_TARGET STREQUAL "AMD")
if (GGML_SYCL_HIP_TARGET STREQUAL "")
message(ERROR "Can't enable SYCL hip backend, GGML_SYCL_HIP_TARGET has not been set.")
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${GGML_SYCL_HIP_TARGET}")
list(APPEND GGML_EXTRA_LIBS_PRIVATE sycl pthread m dl onemkl)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} -fsycl pthread m dl onemkl)
endif()
endif()
endif()
@ -601,7 +523,7 @@ if (GGML_RPC)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_RPC)
if (WIN32)
list(APPEND GGML_EXTRA_LIBS_PRIVATE ws2_32)
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} ws2_32)
endif()
set(GGML_HEADERS_RPC ../include/ggml-rpc.h)
@ -609,11 +531,14 @@ if (GGML_RPC)
endif()
if (GGML_VULKAN)
find_package(Vulkan COMPONENTS glslc REQUIRED)
find_package(Vulkan)
if (Vulkan_FOUND)
message(STATUS "Vulkan found")
set(GGML_HEADERS_VULKAN ../include/ggml-vulkan.h)
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_VULKAN)
# Workaround to the "can't dereference invalidated vector iterator" bug in clang-cl debug build
@ -634,14 +559,6 @@ if (GGML_VULKAN)
add_compile_definitions(GGML_VULKAN_MEMORY_DEBUG)
endif()
if (GGML_VULKAN_SHADER_DEBUG_INFO)
add_compile_definitions(GGML_VULKAN_SHADER_DEBUG_INFO)
endif()
if (GGML_VULKAN_PERF)
add_compile_definitions(GGML_VULKAN_PERF)
endif()
if (GGML_VULKAN_VALIDATE)
add_compile_definitions(GGML_VULKAN_VALIDATE)
endif()
@ -650,37 +567,7 @@ if (GGML_VULKAN)
add_compile_definitions(GGML_VULKAN_RUN_TESTS)
endif()
add_subdirectory(vulkan-shaders)
set (_ggml_vk_genshaders_cmd vulkan-shaders-gen)
set (_ggml_vk_header ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.hpp)
set (_ggml_vk_source ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan-shaders.cpp)
set (_ggml_vk_input_dir ${CMAKE_CURRENT_SOURCE_DIR}/vulkan-shaders)
set (_ggml_vk_output_dir ${CMAKE_CURRENT_BINARY_DIR}/vulkan-shaders.spv)
file(GLOB _ggml_vk_shader_deps "${_ggml_vk_input_dir}/*.comp")
add_custom_command(
OUTPUT ${_ggml_vk_header}
${_ggml_vk_source}
COMMAND ${_ggml_vk_genshaders_cmd}
--glslc ${Vulkan_GLSLC_EXECUTABLE}
--input-dir ${_ggml_vk_input_dir}
--output-dir ${_ggml_vk_output_dir}
--target-hpp ${_ggml_vk_header}
--target-cpp ${_ggml_vk_source}
--no-clean
DEPENDS ${_ggml_vk_shader_deps}
COMMENT "Generate vulkan shaders"
)
set(GGML_HEADERS_VULKAN ${CMAKE_CURRENT_SOURCE_DIR}/../include/ggml-vulkan.h ${_ggml_vk_header})
set(GGML_SOURCES_VULKAN ggml-vulkan.cpp ${_ggml_vk_source})
list(APPEND GGML_EXTRA_LIBS_PRIVATE Vulkan::Vulkan)
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR})
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} Vulkan::Vulkan)
else()
message(WARNING "Vulkan not found")
endif()
@ -839,8 +726,8 @@ if (GGML_KOMPUTE)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_KOMPUTE)
list(APPEND GGML_EXTRA_LIBS_PRIVATE kompute)
list(APPEND GGML_EXTRA_INCLUDES ${CMAKE_CURRENT_BINARY_DIR})
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} kompute)
set(GGML_EXTRA_INCLUDES ${GGML_EXTRA_INCLUDES} ${CMAKE_CURRENT_BINARY_DIR})
else()
message(WARNING "Kompute not found")
endif()
@ -856,71 +743,6 @@ if (GGML_CPU_HBM)
target_link_libraries(ggml PUBLIC memkind)
endif()
if (GGML_CANN)
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif()
if (CANN_INSTALL_DIR)
# Only Support Linux.
if (GGML_CANN)
if (NOT UNIX)
set(GGML_CANN OFF)
message(WARNING "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}. Turning off GGML_CANN")
endif()
endif()
# Supported platforms: x86-64, arm64
if (GGML_CANN)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
else()
set(GGML_CANN OFF)
message(WARNING "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}. Turning off GGML_CANN")
endif()
endif()
# Set header and libs
if(GGML_CANN)
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)
add_subdirectory(ggml-cann/kernels)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
ascendc_kernels
)
set(GGML_HEADERS_CANN "../include/ggml-cann.h")
file(GLOB GGML_SOURCES_CANN "ggml-cann/*.cpp")
list(APPEND GGML_SOURCES_CANN "ggml-cann.cpp")
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
list(APPEND GGML_EXTRA_LIBS_PRIVATE ${CANN_LIBRARIES} )
list(APPEND GGML_EXTRA_INCLUDES ${CANN_INCLUDE_DIRS})
list(APPEND GGML_EXTRA_LIBDIRS ${CANN_INSTALL_DIR}/lib64)
list(APPEND GGML_CDEF_PUBLIC GGML_USE_CANN)
endif()
else()
set(GGML_CANN OFF)
message(WARNING "CANN: Can't find CANN_INSTALL_DIR, do you forget to source set_var.sh. Turning off GGML_CANN")
endif()
if(NOT GGML_CANN)
message(WARNING "CANN: GGML_CANN is turned OFF, see above for details.")
endif()
endif()
function(get_flags CCID CCVER)
set(C_FLAGS "")
set(CXX_FLAGS "")
@ -939,10 +761,8 @@ function(get_flags CCID CCVER)
set(C_FLAGS -Wdouble-promotion)
set(CXX_FLAGS -Wno-array-bounds)
if (NOT GGML_MUSA)
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
list(APPEND CXX_FLAGS -Wno-format-truncation)
endif()
if (CCVER VERSION_GREATER_EQUAL 7.1.0)
list(APPEND CXX_FLAGS -Wno-format-truncation)
endif()
if (CCVER VERSION_GREATER_EQUAL 8.1.0)
list(APPEND CXX_FLAGS -Wextra-semi)
@ -1201,7 +1021,6 @@ elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LW
endif()
if (GGML_AVX512)
list(APPEND ARCH_FLAGS -mavx512f)
list(APPEND ARCH_FLAGS -mavx512dq)
list(APPEND ARCH_FLAGS -mavx512bw)
endif()
if (GGML_AVX512_VBMI)
@ -1275,7 +1094,7 @@ endif()
# Data types, macros and functions related to controlling CPU affinity and
# some memory allocation are available on Linux through GNU extensions in libc
if (CMAKE_SYSTEM_NAME MATCHES "Linux" OR CMAKE_SYSTEM_NAME MATCHES "Android")
if (CMAKE_SYSTEM_NAME MATCHES "Linux")
add_compile_definitions(_GNU_SOURCE)
endif()
@ -1325,7 +1144,7 @@ add_library(ggml
../include/ggml-backend.h
ggml.c
ggml-alloc.c
ggml-backend.cpp
ggml-backend.c
ggml-quants.c
ggml-quants.h
${GGML_SOURCES_CUDA} ${GGML_HEADERS_CUDA}
@ -1338,34 +1157,24 @@ add_library(ggml
${GGML_SOURCES_ROCM} ${GGML_HEADERS_ROCM}
${GGML_SOURCES_BLAS} ${GGML_HEADERS_BLAS}
${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
${GGML_SOURCES_CANN} ${GGML_HEADERS_CANN}
ggml-aarch64.c ggml-aarch64.h
)
if (EMSCRIPTEN)
set_target_properties(ggml PROPERTIES COMPILE_FLAGS "-msimd128")
endif()
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
target_include_directories(ggml PUBLIC ../include)
target_compile_definitions(ggml PUBLIC ${GGML_CDEF_PUBLIC})
target_include_directories(ggml PUBLIC ../include)
target_include_directories(ggml PRIVATE . ${GGML_EXTRA_INCLUDES})
target_link_directories (ggml PRIVATE ${GGML_EXTRA_LIBDIRS})
target_compile_features (ggml PRIVATE c_std_11) # don't bump
list(APPEND GGML_EXTRA_LIBS_PRIVATE Threads::Threads)
target_link_libraries(ggml PRIVATE Threads::Threads ${GGML_EXTRA_LIBS})
find_library(MATH_LIBRARY m)
if (MATH_LIBRARY)
if (NOT WIN32 OR NOT GGML_SYCL)
list(APPEND GGML_EXTRA_LIBS_PRIVATE m)
endif()
target_link_libraries(ggml PRIVATE ${MATH_LIBRARY})
endif()
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PRIVATE)
list(REMOVE_DUPLICATES GGML_EXTRA_LIBS_PUBLIC)
target_link_libraries(ggml PRIVATE ${GGML_EXTRA_LIBS_PRIVATE} PUBLIC ${GGML_EXTRA_LIBS_PUBLIC})
if (BUILD_SHARED_LIBS)
set_target_properties(ggml PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(ggml PRIVATE GGML_SHARED GGML_BUILD)
endif()

File diff suppressed because it is too large

View File

@ -1,39 +0,0 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
#ifdef __cplusplus
extern "C" {
#endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif

Some files were not shown because too many files have changed in this diff.