mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-07-02 23:41:28 +02:00
Compare commits
49 Commits
v1.7.2-pre
...
gg/objc
Author | SHA1 | Date | |
---|---|---|---|
b67bdc9430 | |||
5e966f7844 | |||
54005478af | |||
49c389b40a | |||
37c88027e1 | |||
9db070a3c5 | |||
7fd8d9c220 | |||
06e059b8f8 | |||
c9f49d5f9d | |||
f4c1d7df39 | |||
339b8e559c | |||
5f6d6919b4 | |||
8ee767732f | |||
45f1f9144f | |||
53589c8f12 | |||
7ac2f17fac | |||
48862c7b27 | |||
44f7d9f4e3 | |||
fd12302587 | |||
f80bef4630 | |||
161b443514 | |||
ef7fbe1c66 | |||
0879d3599e | |||
2a444dc5bd | |||
45cf1634dc | |||
dcb2922d1d | |||
3c5c751174 | |||
24ad19d0e9 | |||
bd574b05af | |||
7e0eafcb1e | |||
75670ae673 | |||
d4fcdf602b | |||
1bebb1a116 | |||
ee437cde59 | |||
c1506d38cf | |||
c9541741e6 | |||
6a55015dc4 | |||
7e86030d4d | |||
401fbea326 | |||
44d1cbdfe9 | |||
3216efef2e | |||
2c0484ebf7 | |||
3298916e5e | |||
746bf2596f | |||
5f7e094ccb | |||
6266a9f9e5 | |||
d24f981fb2 | |||
01d3bd7d5c | |||
bb12cd9b77 |
13
.github/workflows/build.yml
vendored
13
.github/workflows/build.yml
vendored
@ -549,8 +549,9 @@ jobs:
|
||||
cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
|
||||
mkdir models/ggml-base.en-encoder.mlmodelc
|
||||
|
||||
- name: Build objc example
|
||||
run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build
|
||||
# TODO: disabled because it fails for some reason with Github Actions
|
||||
# - name: Build objc example
|
||||
# run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build
|
||||
|
||||
- name: Build swiftui example
|
||||
run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphonesimulator build
|
||||
@ -564,12 +565,6 @@ jobs:
|
||||
with:
|
||||
path: whisper
|
||||
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
repository: ggerganov/ggml
|
||||
path: ggml
|
||||
|
||||
- name: Install Java
|
||||
uses: actions/setup-java@v4
|
||||
with:
|
||||
@ -588,7 +583,7 @@ jobs:
|
||||
run: |
|
||||
export PATH_TO_GGML=$PWD/ggml
|
||||
cd whisper/examples/whisper.android
|
||||
./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
|
||||
./gradlew assembleRelease --no-daemon
|
||||
|
||||
# TODO: disable because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
|
||||
# android_java:
|
||||
|
4
.gitignore
vendored
4
.gitignore
vendored
@ -1,5 +1,6 @@
|
||||
*.o
|
||||
*.a
|
||||
*.d
|
||||
.cache/
|
||||
.coreml/
|
||||
.test/
|
||||
@ -19,6 +20,9 @@ build-*/
|
||||
.swiftpm
|
||||
*.metallib
|
||||
|
||||
ggml-metal-embed.metal
|
||||
ggml-metal-embed.metal.tmp
|
||||
|
||||
/main
|
||||
/stream
|
||||
/command
|
||||
|
@ -1,6 +1,6 @@
|
||||
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
|
||||
project("whisper.cpp" C CXX)
|
||||
project("whisper.cpp" VERSION 1.7.1)
|
||||
project("whisper.cpp" VERSION 1.7.2)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(SOVERSION 1)
|
||||
|
261
Makefile
261
Makefile
@ -444,17 +444,17 @@ endif
|
||||
else
|
||||
MK_CFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
MK_CXXFLAGS += -march=rv64gcv -mabi=lp64d
|
||||
endif
|
||||
endif # RISCV
|
||||
|
||||
ifndef GGML_NO_ACCELERATE
|
||||
# Mac OS - include Accelerate framework.
|
||||
# `-framework Accelerate` works both with Apple Silicon and Mac Intel
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE
|
||||
MK_CPPFLAGS += -DACCELERATE_NEW_LAPACK
|
||||
MK_CPPFLAGS += -DACCELERATE_LAPACK_ILP64
|
||||
MK_LDFLAGS += -framework Accelerate
|
||||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
|
||||
endif
|
||||
endif # GGML_NO_ACCELERATE
|
||||
|
||||
@ -464,29 +464,38 @@ ifndef GGML_NO_OPENMP
|
||||
MK_CXXFLAGS += -fopenmp
|
||||
endif # GGML_NO_OPENMP
|
||||
|
||||
ifdef WHISPER_COREML
|
||||
MK_CXXFLAGS += -DWHISPER_USE_COREML
|
||||
LDFLAGS += -framework Foundation -framework CoreML
|
||||
|
||||
ifdef WHISPER_COREML_ALLOW_FALLBACK
|
||||
MK_CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
|
||||
endif
|
||||
endif # WHISPER_COREML
|
||||
|
||||
ifdef GGML_OPENBLAS
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas)
|
||||
MK_LDFLAGS += $(shell pkg-config --libs openblas)
|
||||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
|
||||
endif # GGML_OPENBLAS
|
||||
|
||||
ifdef GGML_OPENBLAS64
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS $(shell pkg-config --cflags-only-I openblas64)
|
||||
MK_CFLAGS += $(shell pkg-config --cflags-only-other openblas64)
|
||||
MK_LDFLAGS += $(shell pkg-config --libs openblas64)
|
||||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
|
||||
endif # GGML_OPENBLAS64
|
||||
|
||||
ifdef GGML_BLIS
|
||||
MK_CPPFLAGS += -DGGML_USE_BLAS -I/usr/local/include/blis -I/usr/include/blis
|
||||
MK_LDFLAGS += -lblis -L/usr/local/lib
|
||||
OBJ_GGML += ggml/src/ggml-blas.o
|
||||
OBJ_GGML += ggml/src/ggml-blas/ggml-blas.o
|
||||
endif # GGML_BLIS
|
||||
|
||||
ifdef GGML_RPC
|
||||
MK_CPPFLAGS += -DGGML_USE_RPC
|
||||
OBJ_GGML += ggml/src/ggml-rpc.o
|
||||
OBJ_GGML += ggml/src/ggml-rpc/ggml-rpc.o
|
||||
endif # GGML_RPC
|
||||
|
||||
OBJ_CUDA_TMPL = $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu))
|
||||
@ -513,7 +522,7 @@ ifdef GGML_CUDA
|
||||
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L$(CUDA_PATH)/lib64/stubs -L/usr/lib/wsl/lib
|
||||
MK_NVCCFLAGS += -use_fast_math
|
||||
|
||||
OBJ_GGML += ggml/src/ggml-cuda.o
|
||||
OBJ_GGML += ggml/src/ggml-cuda/ggml-cuda.o
|
||||
OBJ_GGML += $(patsubst %.cu,%.o,$(wildcard ggml/src/ggml-cuda/*.cu))
|
||||
OBJ_GGML += $(OBJ_CUDA_TMPL)
|
||||
ifdef WHISPER_FATAL_WARNINGS
|
||||
@ -615,11 +624,11 @@ ggml/src/ggml-cuda/%.o: \
|
||||
ggml/src/ggml-cuda/common.cuh
|
||||
$(NVCC_COMPILE)
|
||||
|
||||
ggml/src/ggml-cuda.o: \
|
||||
ggml/src/ggml-cuda.cu \
|
||||
ggml/src/ggml-cuda/ggml-cuda.o: \
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu \
|
||||
ggml/include/ggml-cuda.h \
|
||||
ggml/include/ggml.h \
|
||||
ggml/include/ggml-backend.h \
|
||||
ggml/include/ggml-cuda.h \
|
||||
ggml/src/ggml-backend-impl.h \
|
||||
ggml/src/ggml-common.h \
|
||||
$(wildcard ggml/src/ggml-cuda/*.cuh)
|
||||
@ -742,50 +751,43 @@ endif # GGML_HIPBLAS
|
||||
ifdef GGML_METAL
|
||||
MK_CPPFLAGS += -DGGML_USE_METAL
|
||||
MK_LDFLAGS += -framework Foundation -framework Metal -framework MetalKit
|
||||
OBJ_GGML += ggml/src/ggml-metal.o
|
||||
OBJ_GGML += ggml/src/ggml-metal/ggml-metal.o
|
||||
ifdef GGML_METAL_NDEBUG
|
||||
MK_CPPFLAGS += -DGGML_METAL_NDEBUG
|
||||
endif
|
||||
|
||||
ifdef GGML_METAL_EMBED_LIBRARY
|
||||
MK_CPPFLAGS += -DGGML_METAL_EMBED_LIBRARY
|
||||
OBJ_GGML += ggml/src/ggml-metal-embed.o
|
||||
OBJ_GGML += ggml/src/ggml-metal/ggml-metal-embed.o
|
||||
endif
|
||||
endif # GGML_METAL
|
||||
|
||||
ifdef WHISPER_COREML
|
||||
MK_CXXFLAGS += -DWHISPER_USE_COREML
|
||||
LDFLAGS += -framework Foundation -framework CoreML
|
||||
|
||||
ifdef WHISPER_COREML_ALLOW_FALLBACK
|
||||
MK_CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
|
||||
endif
|
||||
endif
|
||||
|
||||
# ===
|
||||
|
||||
ifdef GGML_METAL
|
||||
ggml/src/ggml-metal.o: \
|
||||
ggml/src/ggml-metal.m \
|
||||
ggml/src/ggml-metal/ggml-metal.o: \
|
||||
ggml/src/ggml-metal/ggml-metal.m \
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||
ggml/include/ggml-metal.h \
|
||||
ggml/include/ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
|
||||
ifdef GGML_METAL_EMBED_LIBRARY
|
||||
ggml/src/ggml-metal-embed.o: \
|
||||
ggml/src/ggml-metal.metal \
|
||||
ggml/src/ggml-metal/ggml-metal-embed.o: \
|
||||
ggml/src/ggml-metal/ggml-metal.metal \
|
||||
ggml/src/ggml-metal/ggml-metal-impl.h \
|
||||
ggml/src/ggml-common.h
|
||||
@echo "Embedding Metal library"
|
||||
@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
|
||||
$(eval TEMP_ASSEMBLY=$(shell mktemp))
|
||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
|
||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
|
||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
|
||||
@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
|
||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
|
||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
|
||||
@$(AS) $(TEMP_ASSEMBLY) -o $@
|
||||
@rm -f ${TEMP_ASSEMBLY}
|
||||
@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
|
||||
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
|
||||
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
|
||||
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
|
||||
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
|
||||
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
|
||||
@rmdir ${TEMP_ASSEMBLY}
|
||||
endif
|
||||
endif # GGML_METAL
|
||||
|
||||
@ -801,11 +803,17 @@ endif
|
||||
|
||||
OBJ_GGML += \
|
||||
ggml/src/ggml.o \
|
||||
ggml/src/ggml-cpu.o \
|
||||
ggml/src/ggml-aarch64.o \
|
||||
ggml/src/ggml-alloc.o \
|
||||
ggml/src/ggml-backend.o \
|
||||
ggml/src/ggml-backend-reg.o \
|
||||
ggml/src/ggml-opt.o \
|
||||
ggml/src/ggml-quants.o \
|
||||
ggml/src/ggml-aarch64.o
|
||||
ggml/src/ggml-threading.o \
|
||||
ggml/src/ggml-cpu/ggml-cpu.o \
|
||||
ggml/src/ggml-cpu/ggml-cpu-cpp.o \
|
||||
ggml/src/ggml-cpu/ggml-cpu-aarch64.o \
|
||||
ggml/src/ggml-cpu/ggml-cpu-quants.o
|
||||
|
||||
OBJ_WHISPER += \
|
||||
src/whisper.o
|
||||
@ -910,114 +918,64 @@ endif
|
||||
# Build libraries
|
||||
#
|
||||
|
||||
# ggml
|
||||
LIB_GGML = libggml.so
|
||||
LIB_GGML_S = libggml.a
|
||||
|
||||
ggml/src/ggml.o: \
|
||||
ggml/src/ggml.c \
|
||||
ggml/include/ggml.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
LIB_LLAMA = libllama.so
|
||||
LIB_LLAMA_S = libllama.a
|
||||
|
||||
ggml/src/ggml-cpu.o: \
|
||||
ggml/src/ggml-cpu.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
LIB_COMMON = libcommon.so
|
||||
LIB_COMMON_S = libcommon.a
|
||||
|
||||
ggml/src/ggml-alloc.o: \
|
||||
ggml/src/ggml-alloc.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/include/ggml-alloc.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
LIB_COMMON_SDL = libcommon-sdl.so
|
||||
LIB_COMMON_SDL_S = libcommon-sdl.a
|
||||
|
||||
ggml/src/ggml-backend.o: \
|
||||
ggml/src/ggml-backend.cpp \
|
||||
ggml/include/ggml.h \
|
||||
ggml/include/ggml-backend.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
# Targets
|
||||
BUILD_TARGETS += $(LIB_GGML) $(LIB_GGML_S) $(LIB_LLAMA) $(LIB_LLAMA_S) $(LIB_COMMON) $(LIB_COMMON_S)
|
||||
|
||||
ggml/src/ggml-quants.o: \
|
||||
ggml/src/ggml-quants.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-quants.h \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
# Dependency files
|
||||
DEP_FILES = $(OBJ_GGML:.o=.d) $(OBJ_LLAMA:.o=.d) $(OBJ_COMMON:.o=.d)
|
||||
|
||||
ggml/src/ggml-aarch64.o: \
|
||||
ggml/src/ggml-aarch64.c \
|
||||
ggml/include/ggml.h \
|
||||
ggml/src/ggml-aarch64.h \
|
||||
ggml/src/ggml-common.h
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
# Default target
|
||||
all: $(BUILD_TARGETS)
|
||||
|
||||
ggml/src/ggml-blas.o: \
|
||||
ggml/src/ggml-blas.cpp \
|
||||
ggml/include/ggml-blas.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
ifdef GGML_LLAMAFILE
|
||||
ggml/src/sgemm.o: \
|
||||
ggml/src/sgemm.cpp \
|
||||
ggml/src/sgemm.h \
|
||||
ggml/include/ggml.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # GGML_LLAMAFILE
|
||||
|
||||
ifdef GGML_RPC
|
||||
ggml/src/ggml-rpc.o: \
|
||||
ggml/src/ggml-rpc.cpp \
|
||||
ggml/include/ggml-rpc.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
endif # GGML_RPC
|
||||
|
||||
$(LIB_GGML): \
|
||||
$(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(LIB_GGML_S): \
|
||||
$(OBJ_GGML)
|
||||
ar rcs $(LIB_GGML_S) $^
|
||||
|
||||
# whisper
|
||||
|
||||
src/whisper.o: \
|
||||
src/whisper.cpp \
|
||||
include/whisper.h \
|
||||
# Note: need this exception because `ggml-cpu.c` and `ggml-cpu.cpp` both produce the same obj/dep files
|
||||
# g++ -M -I ./ggml/include/ -I ./ggml/src ggml/src/ggml-cpu/ggml-cpu.cpp | grep ggml
|
||||
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
|
||||
ggml/src/ggml-cpu/ggml-cpu.cpp \
|
||||
ggml/include/ggml-backend.h \
|
||||
ggml/include/ggml.h \
|
||||
ggml/include/ggml-alloc.h \
|
||||
ggml/include/ggml-backend.h \
|
||||
ggml/include/ggml-cuda.h \
|
||||
ggml/include/ggml-metal.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
ggml/src/ggml-backend-impl.h \
|
||||
ggml/include/ggml-cpu.h \
|
||||
ggml/src/ggml-impl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
$(LIB_WHISPER): \
|
||||
$(OBJ_WHISPER) \
|
||||
$(LIB_GGML)
|
||||
# Rules for building object files
|
||||
ggml/%.o: ggml/%.c
|
||||
$(CC) $(CFLAGS) -MMD -c $< -o $@
|
||||
|
||||
ggml/%.o: ggml/%.cpp
|
||||
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
|
||||
|
||||
src/%.o: src/%.cpp
|
||||
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
|
||||
|
||||
examples/%.o: examples/%.cpp
|
||||
$(CXX) $(CXXFLAGS) -MMD -c $< -o $@
|
||||
|
||||
# Rules for building libraries
|
||||
$(LIB_GGML): $(OBJ_GGML)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(LIB_WHISPER_S): \
|
||||
$(OBJ_WHISPER) \
|
||||
$(OBJ_GGML)
|
||||
ar rcs $(LIB_WHISPER_S) $^
|
||||
$(LIB_GGML_S): $(OBJ_GGML)
|
||||
ar rcs $(LIB_GGML_S) $^
|
||||
|
||||
# common
|
||||
|
||||
examples/common.o: \
|
||||
examples/common.cpp \
|
||||
examples/common.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
examples/common-ggml.o: \
|
||||
examples/common-ggml.cpp \
|
||||
examples/common-ggml.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
|
||||
$(LIB_COMMON): \
|
||||
$(OBJ_COMMON)
|
||||
$(LIB_LLAMA): $(OBJ_LLAMA) $(LIB_GGML)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(LIB_COMMON_S): \
|
||||
$(OBJ_COMMON)
|
||||
ar rcs $(LIB_COMMON_S) $^
|
||||
$(LIB_LLAMA_S): $(OBJ_LLAMA)
|
||||
ar rcs $(LIB_LLAMA_S) $^
|
||||
|
||||
# common-sdl
|
||||
|
||||
@ -1029,34 +987,21 @@ examples/common-sdl.o: \
|
||||
examples/common-sdl.h
|
||||
$(CXX) $(CXXFLAGS) $(CFLAGS_SDL) -c $< -o $@
|
||||
|
||||
$(LIB_COMMON_SDL): \
|
||||
$(OBJ_SDL)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS) $(LDFLAGS_SDL)
|
||||
$(LIB_COMMON): $(OBJ_COMMON) $(LIB_LLAMA) $(LIB_GGML)
|
||||
$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
|
||||
|
||||
$(LIB_COMMON_SDL_S): \
|
||||
$(OBJ_SDL)
|
||||
ar rcs $(LIB_COMMON_SDL_S) $^
|
||||
$(LIB_COMMON_S): $(OBJ_COMMON)
|
||||
ar rcs $(LIB_COMMON_S) $^
|
||||
|
||||
# Include dependency files
|
||||
-include $(DEP_FILES)
|
||||
|
||||
# Clean rule
|
||||
clean:
|
||||
rm -vrf *.dot $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
rm -rvf src/*.o
|
||||
rm -rvf src/coreml/*.o
|
||||
rm -rvf tests/*.o
|
||||
rm -rvf examples/*.o
|
||||
rm -rvf *.a
|
||||
rm -rvf *.dll
|
||||
rm -rvf *.so
|
||||
rm -rvf *.dot
|
||||
rm -rvf ggml/*.a
|
||||
rm -rvf ggml/*.dll
|
||||
rm -rvf ggml/*.so
|
||||
rm -vrf ggml/src/*.o
|
||||
rm -vrf ggml/src/ggml-metal-embed.metal
|
||||
rm -vrf ggml/src/ggml-cuda/*.o
|
||||
rm -vrf ggml/src/ggml-cuda/template-instances/*.o
|
||||
rm -rvf $(BUILD_TARGETS)
|
||||
rm -rvf $(TEST_TARGETS)
|
||||
find examples -type f -name "*.o" -delete
|
||||
rm -vrf $(BUILD_TARGETS) $(TEST_TARGETS)
|
||||
rm -rvf *.a *.dll *.so *.dot
|
||||
find ggml src tests examples -type f -name "*.o" -delete
|
||||
find ggml src tests examples -type f -name "*.d" -delete
|
||||
|
||||
#
|
||||
# Examples
|
||||
|
@ -28,7 +28,7 @@ let package = Package(
|
||||
"tests",
|
||||
"CMakeLists.txt",
|
||||
"Makefile",
|
||||
"ggml/src/ggml-metal-embed.metal"
|
||||
"ggml/src/ggml-metal/ggml-metal-embed.metal"
|
||||
],
|
||||
sources: [
|
||||
"ggml/src/ggml.c",
|
||||
@ -36,16 +36,22 @@ let package = Package(
|
||||
"ggml/src/ggml-aarch64.c",
|
||||
"ggml/src/ggml-alloc.c",
|
||||
"ggml/src/ggml-backend.cpp",
|
||||
"ggml/src/ggml-cpu.c",
|
||||
"ggml/src/ggml-backend-reg.cpp",
|
||||
"ggml/src/ggml-cpu/ggml-cpu.c",
|
||||
"ggml/src/ggml-cpu/ggml-cpu.cpp",
|
||||
"ggml/src/ggml-cpu/ggml-cpu-aarch64.c",
|
||||
"ggml/src/ggml-cpu/ggml-cpu-quants.c",
|
||||
"ggml/src/ggml-quants.c",
|
||||
"ggml/src/ggml-metal.m"
|
||||
"ggml/src/ggml-threading.cpp",
|
||||
"ggml/src/ggml-metal/ggml-metal.m"
|
||||
],
|
||||
resources: [.process("ggml/src/ggml-metal.metal")],
|
||||
resources: [.process("ggml/src/ggml-metal/ggml-metal.metal")],
|
||||
publicHeadersPath: "spm-headers",
|
||||
cSettings: [
|
||||
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
|
||||
.define("GGML_USE_ACCELERATE"),
|
||||
.unsafeFlags(["-fno-objc-arc"]),
|
||||
.headerSearchPath("ggml/src"),
|
||||
.define("GGML_USE_ACCELERATE"),
|
||||
.define("GGML_USE_METAL")
|
||||
// NOTE: NEW_LAPACK will required iOS version 16.4+
|
||||
// We should consider add this in the future when we drop support for iOS 14
|
||||
|
@ -7,7 +7,7 @@
|
||||
[](https://conan.io/center/whisper-cpp)
|
||||
[](https://www.npmjs.com/package/whisper.cpp/)
|
||||
|
||||
Stable: [v1.7.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
Stable: [v1.7.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.2) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
|
||||
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
||||
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "1.7.1",
|
||||
"version": "1.7.2",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
|
@ -137,7 +137,7 @@ if (WHISPER_SDL2)
|
||||
set_target_properties(lsp PROPERTIES FOLDER "examples")
|
||||
if (GGML_SYCL)
|
||||
add_subdirectory(sycl)
|
||||
set_target_properties(sycl PROPERTIES FOLDER "examples")
|
||||
set_target_properties(ls-sycl-device PROPERTIES FOLDER "examples")
|
||||
endif()
|
||||
endif (WHISPER_SDL2)
|
||||
endif()
|
||||
|
@ -5,5 +5,5 @@
|
||||
set(TARGET ls-sycl-device)
|
||||
add_executable(${TARGET} ls-sycl-device.cpp)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
@ -7,13 +7,16 @@ cd build
|
||||
source /opt/intel/oneapi/setvars.sh
|
||||
|
||||
#for FP16
|
||||
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
|
||||
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
|
||||
|
||||
#for FP32
|
||||
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#for other features from the examples, e.g. stream and talk link with SDL2:
|
||||
#cmake .. -DGGML_SYCL=ON -DWHISPER_SDL2=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
|
||||
|
||||
#build example/main only
|
||||
#cmake --build . --config Release --target main
|
||||
|
||||
#build all binary
|
||||
cmake --build . --config Release -v
|
||||
cmake --build . --config Release -v
|
||||
|
@ -179,6 +179,7 @@ enum llm_arch {
|
||||
LLM_ARCH_COMMAND_R,
|
||||
LLM_ARCH_DBRX,
|
||||
LLM_ARCH_OLMO,
|
||||
LLM_ARCH_OLMO_1124,
|
||||
LLM_ARCH_OLMOE,
|
||||
LLM_ARCH_OPENELM,
|
||||
LLM_ARCH_ARCTIC,
|
||||
@ -232,6 +233,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
|
||||
{ LLM_ARCH_COMMAND_R, "command-r" },
|
||||
{ LLM_ARCH_DBRX, "dbrx" },
|
||||
{ LLM_ARCH_OLMO, "olmo" },
|
||||
{ LLM_ARCH_OLMO_1124, "olmo_1124" },
|
||||
{ LLM_ARCH_OLMOE, "olmoe" },
|
||||
{ LLM_ARCH_OPENELM, "openelm" },
|
||||
{ LLM_ARCH_ARCTIC, "arctic" },
|
||||
@ -1207,6 +1209,25 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_OLMO_1124,
|
||||
{
|
||||
{ LLM_TENSOR_TOKEN_EMBD, "token_embd" },
|
||||
{ LLM_TENSOR_OUTPUT_NORM, "output_norm" },
|
||||
{ LLM_TENSOR_OUTPUT, "output" },
|
||||
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
||||
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
||||
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
||||
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
||||
{ LLM_TENSOR_ATTN_POST_NORM, "blk.%d.post_attention_norm" },
|
||||
{ LLM_TENSOR_ATTN_Q_NORM, "blk.%d.attn_q_norm" },
|
||||
{ LLM_TENSOR_ATTN_K_NORM, "blk.%d.attn_k_norm" },
|
||||
{ LLM_TENSOR_FFN_POST_NORM, "blk.%d.post_ffw_norm" },
|
||||
{ LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
|
||||
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
||||
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
||||
},
|
||||
},
|
||||
{
|
||||
LLM_ARCH_OLMOE,
|
||||
{
|
||||
@ -2907,9 +2928,15 @@ struct llama_model {
|
||||
// for quantize-stats only
|
||||
std::vector<std::pair<std::string, struct ggml_tensor *>> tensors_by_name;
|
||||
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_load_us = 0;
|
||||
int64_t t_start_us = 0;
|
||||
|
||||
// total number of parameters in the model
|
||||
uint64_t n_elements = 0;
|
||||
|
||||
// total size of all the tensors in the model in bytes
|
||||
size_t n_bytes = 0;
|
||||
|
||||
// keep track of loaded lora adapters
|
||||
std::set<struct llama_lora_adapter *> lora_adapters;
|
||||
|
||||
@ -3454,21 +3481,13 @@ static bool llama_kv_cache_init(
|
||||
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(i) + hparams.n_embd_k_s();
|
||||
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(i) + hparams.n_embd_v_s();
|
||||
|
||||
const llama_model::buft_list_t * buft_list;
|
||||
ggml_backend_buffer_type_t buft;
|
||||
if (offload) {
|
||||
buft_list = model.dev_layer.at(i).buft_list;
|
||||
auto * dev = model.dev_layer.at(i).dev;
|
||||
buft = ggml_backend_dev_buffer_type(dev);
|
||||
} else {
|
||||
buft_list = &model.cpu_buft_list;
|
||||
buft = ggml_backend_cpu_buffer_type();
|
||||
}
|
||||
ggml_backend_buffer_type_t buft = select_buft(*buft_list,
|
||||
[&](ggml_context * ctx) {
|
||||
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size);
|
||||
if (hparams.rope_type == LLAMA_ROPE_TYPE_NONE) {
|
||||
return k;
|
||||
}
|
||||
ggml_tensor * p = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1);
|
||||
return ggml_rope(ctx, k, p, hparams.n_rot, hparams.rope_type);
|
||||
});
|
||||
ggml_context * ctx = ctx_for_buft(buft);
|
||||
|
||||
if (!ctx) {
|
||||
@ -4275,8 +4294,8 @@ struct llama_model_loader {
|
||||
int n_tensors = 0;
|
||||
int n_created = 0;
|
||||
|
||||
int64_t n_elements = 0;
|
||||
size_t n_bytes = 0;
|
||||
uint64_t n_elements = 0;
|
||||
size_t n_bytes = 0;
|
||||
|
||||
bool use_mmap = false;
|
||||
bool check_tensors;
|
||||
@ -5344,6 +5363,11 @@ static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
||||
}
|
||||
}
|
||||
|
||||
static void llm_load_stats(llama_model_loader & ml, llama_model & model) {
|
||||
model.n_elements = ml.n_elements;
|
||||
model.n_bytes = ml.n_bytes;
|
||||
}
|
||||
|
||||
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
||||
model.arch = ml.get_arch();
|
||||
if (model.arch == LLM_ARCH_UNKNOWN) {
|
||||
@ -5874,6 +5898,17 @@ static void llm_load_hparams(
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OLMO_1124:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
|
||||
switch (hparams.n_layer) {
|
||||
case 16: model.type = e_model::MODEL_1B; break;
|
||||
case 32: model.type = e_model::MODEL_7B; break;
|
||||
case 40: model.type = e_model::MODEL_13B; break;
|
||||
default: model.type = e_model::MODEL_UNKNOWN;
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OLMOE:
|
||||
{
|
||||
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
|
||||
@ -7254,7 +7289,7 @@ static llama_model::buft_list_t make_cpu_buft_list(llama_model & model) {
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
|
||||
auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
|
||||
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_cpu_get_extra_bufts");
|
||||
ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
|
||||
if (ggml_backend_dev_get_extra_bufts_fn) {
|
||||
ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
|
||||
while (extra_bufts && *extra_bufts) {
|
||||
@ -7521,7 +7556,7 @@ static bool llm_load_tensors(
|
||||
|
||||
// avoid using a host buffer when using mmap
|
||||
auto * buft_dev = ggml_backend_buft_get_device(buft);
|
||||
if (ml.use_mmap && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
||||
if (ml.use_mmap && buft_dev && buft == ggml_backend_dev_host_buffer_type(buft_dev)) {
|
||||
auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
buft = ggml_backend_dev_buffer_type(cpu_dev);
|
||||
}
|
||||
@ -8556,6 +8591,31 @@ static bool llm_load_tensors(
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OLMO_1124:
|
||||
{
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
// output
|
||||
model.output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
|
||||
model.output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
|
||||
|
||||
for (int i = 0; i < n_layer; ++i) {
|
||||
auto & layer = model.layers[i];
|
||||
|
||||
layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, 0);
|
||||
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
|
||||
layer.attn_q_norm = create_tensor(tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_k_norm = create_tensor(tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {n_embd}, 0);
|
||||
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
|
||||
layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0);
|
||||
layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, 0);
|
||||
layer.ffn_post_norm = create_tensor(tn(LLM_TENSOR_FFN_POST_NORM, "weight", i), {n_embd}, 0);
|
||||
}
|
||||
} break;
|
||||
case LLM_ARCH_OLMOE:
|
||||
{
|
||||
model.tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
|
||||
@ -9128,6 +9188,10 @@ static bool llm_load_tensors(
|
||||
|
||||
// check if it is possible to use buffer_from_host_ptr with this buffer type
|
||||
ggml_backend_dev_t dev = ggml_backend_buft_get_device(buft);
|
||||
if (!dev) {
|
||||
// FIXME: workaround for CPU backend buft having a NULL device
|
||||
dev = ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0);
|
||||
}
|
||||
ggml_backend_dev_props props;
|
||||
ggml_backend_dev_get_props(dev, &props);
|
||||
bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
|
||||
@ -9252,6 +9316,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||
throw std::runtime_error("error loading model vocabulary: " + std::string(e.what()));
|
||||
}
|
||||
|
||||
llm_load_stats(ml, model);
|
||||
llm_load_print_meta(ml, model);
|
||||
|
||||
if (model.vocab.type != LLAMA_VOCAB_TYPE_NONE &&
|
||||
@ -14416,6 +14481,130 @@ struct llm_build_context {
|
||||
return gf;
|
||||
}
|
||||
|
||||
struct ggml_cgraph * build_olmo_1124() {
|
||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, llama_model_max_nodes(model), false);
|
||||
|
||||
// mutable variable, needed during the last layer of the computation to skip unused tokens
|
||||
int32_t n_tokens = this->n_tokens;
|
||||
|
||||
const int64_t n_embd_head = hparams.n_embd_head_v;
|
||||
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
||||
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL;
|
||||
|
||||
inpL = llm_build_inp_embd(ctx0, lctx, hparams, ubatch, model.tok_embd, cb);
|
||||
|
||||
// inp_pos - contains the positions
|
||||
struct ggml_tensor * inp_pos = build_inp_pos();
|
||||
|
||||
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
|
||||
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
|
||||
|
||||
for (int il = 0; il < n_layer; ++il) {
|
||||
struct ggml_tensor * inpSA = inpL;
|
||||
|
||||
cur = inpL;
|
||||
|
||||
// self_attention
|
||||
{
|
||||
// compute Q and K and RoPE them
|
||||
struct ggml_tensor * Qcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wq, cur);
|
||||
cb(Qcur, "Qcur", il);
|
||||
|
||||
struct ggml_tensor * Kcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wk, cur);
|
||||
cb(Kcur, "Kcur", il);
|
||||
|
||||
struct ggml_tensor * Vcur = llm_build_lora_mm(lctx, ctx0, model.layers[il].wv, cur);
|
||||
cb(Vcur, "Vcur", il);
|
||||
|
||||
Qcur = llm_build_norm(ctx0, Qcur, hparams, model.layers[il].attn_q_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(Qcur, "Qcur_normed", il);
|
||||
|
||||
Kcur = llm_build_norm(ctx0, Kcur, hparams, model.layers[il].attn_k_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(Kcur, "Kcur_normed", il);
|
||||
|
||||
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
|
||||
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
||||
|
||||
Qcur = ggml_rope_ext(
|
||||
ctx0, Qcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Qcur, "Qcur_rope", il);
|
||||
|
||||
Kcur = ggml_rope_ext(
|
||||
ctx0, Kcur, inp_pos, nullptr,
|
||||
n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
|
||||
ext_factor, attn_factor, beta_fast, beta_slow
|
||||
);
|
||||
cb(Kcur, "Kcur_rope", il);
|
||||
|
||||
cur = llm_build_kv(ctx0, lctx, kv_self, gf,
|
||||
model.layers[il].wo, NULL,
|
||||
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
|
||||
}
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.layers[il].attn_post_norm, NULL,
|
||||
LLM_NORM_RMS, cb, il);
|
||||
cb(cur, "attn_post_norm", il);
|
||||
|
||||
if (il == n_layer - 1) {
|
||||
// skip computing output for unused tokens
|
||||
struct ggml_tensor * inp_out_ids = build_inp_out_ids();
|
||||
n_tokens = n_outputs;
|
||||
cur = ggml_get_rows(ctx0, cur, inp_out_ids);
|
||||
inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
|
||||
cb(ffn_inp, "ffn_inp", il);
|
||||
|
||||
// feed-forward network
|
||||
cur = llm_build_ffn(ctx0, lctx, ffn_inp,
|
||||
model.layers[il].ffn_up, NULL, NULL,
|
||||
model.layers[il].ffn_gate, NULL, NULL,
|
||||
model.layers[il].ffn_down, NULL, NULL,
|
||||
NULL,
|
||||
LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.layers[il].ffn_post_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "ffn_post_norm", -1);
|
||||
|
||||
cur = ggml_add(ctx0, cur, ffn_inp);
|
||||
cb(cur, "ffn_out", il);
|
||||
|
||||
cur = lctx.cvec.apply_to(ctx0, cur, il);
|
||||
cb(cur, "l_out", il);
|
||||
|
||||
// input for next layer
|
||||
inpL = cur;
|
||||
}
|
||||
|
||||
cur = inpL;
|
||||
|
||||
cur = llm_build_norm(ctx0, cur, hparams,
|
||||
model.output_norm, NULL,
|
||||
LLM_NORM_RMS, cb, -1);
|
||||
cb(cur, "result_norm", -1);
|
||||
|
||||
// lm_head
|
||||
cur = llm_build_lora_mm(lctx, ctx0, model.output, cur);
|
||||
cb(cur, "result_output", -1);
|
||||
|
||||
ggml_build_forward_expand(gf, cur);
|
||||
|
||||
return gf;
|
||||
}
|
||||
|
||||
// based on the build_qwen2moe() function, changes:
|
||||
// * removed shared experts
|
||||
// * removed bias
|
||||
@ -16608,6 +16797,10 @@ static struct ggml_cgraph * llama_build_graph(
|
||||
{
|
||||
result = llm.build_olmo();
|
||||
} break;
|
||||
case LLM_ARCH_OLMO_1124:
|
||||
{
|
||||
result = llm.build_olmo_1124();
|
||||
} break;
|
||||
case LLM_ARCH_OLMOE:
|
||||
{
|
||||
result = llm.build_olmoe();
|
||||
@ -18020,7 +18213,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
||||
|
||||
// apply K-shift if needed
|
||||
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
||||
if (lctx.model.arch == LLM_ARCH_DEEPSEEK2) { // not supported due to MLA
|
||||
if (!llama_kv_cache_can_shift(&lctx)) {
|
||||
GGML_ABORT("Deepseek2 does not support K-shift");
|
||||
}
|
||||
|
||||
@ -18597,6 +18790,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||
llama_model model;
|
||||
llm_load_arch(ml, model);
|
||||
llm_load_hparams(ml, model);
|
||||
llm_load_stats(ml, model);
|
||||
|
||||
struct quantize_state_internal qs(model, params);
|
||||
|
||||
@ -19876,6 +20070,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
||||
case LLM_ARCH_QWEN:
|
||||
case LLM_ARCH_QWEN2:
|
||||
case LLM_ARCH_QWEN2MOE:
|
||||
case LLM_ARCH_OLMO_1124:
|
||||
case LLM_ARCH_OLMOE:
|
||||
case LLM_ARCH_PHI2:
|
||||
case LLM_ARCH_PHI3:
|
||||
@ -19949,19 +20144,11 @@ int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t bu
|
||||
}
|
||||
|
||||
uint64_t llama_model_size(const struct llama_model * model) {
|
||||
uint64_t size = 0;
|
||||
for (const auto & it : model->tensors_by_name) {
|
||||
size += ggml_nbytes(it.second);
|
||||
}
|
||||
return size;
|
||||
return model->n_bytes;
|
||||
}
|
||||
|
||||
uint64_t llama_model_n_params(const struct llama_model * model) {
|
||||
uint64_t nparams = 0;
|
||||
for (const auto & it : model->tensors_by_name) {
|
||||
nparams += ggml_nelements(it.second);
|
||||
}
|
||||
return nparams;
|
||||
return model->n_elements;
|
||||
}
|
||||
|
||||
struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
|
||||
@ -20275,6 +20462,10 @@ void llama_kv_cache_update(struct llama_context * ctx) {
|
||||
llama_kv_cache_update_internal(*ctx);
|
||||
}
|
||||
|
||||
bool llama_kv_cache_can_shift(struct llama_context * ctx) {
|
||||
return ctx->model.arch != LLM_ARCH_DEEPSEEK2; // not supported due to MLA
|
||||
}
|
||||
|
||||
// deprecated
|
||||
size_t llama_get_state_size(struct llama_context * ctx) {
|
||||
return llama_state_get_size(ctx);
|
||||
@ -22021,7 +22212,6 @@ const char * llama_print_system_info(void) {
|
||||
s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
|
||||
s += "RISCV_VECT = " + std::to_string(ggml_cpu_has_riscv_v()) + " | ";
|
||||
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
|
||||
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
|
||||
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
|
||||
s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
|
||||
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
|
||||
@ -22067,28 +22257,6 @@ void llama_perf_context_reset(struct llama_context * ctx) {
|
||||
ctx->t_p_eval_us = ctx->n_p_eval = 0;
|
||||
}
|
||||
|
||||
void llama_perf_dump_yaml(FILE * stream, const llama_context * ctx) {
|
||||
fprintf(stream, "\n");
|
||||
fprintf(stream, "###########\n");
|
||||
fprintf(stream, "# Timings #\n");
|
||||
fprintf(stream, "###########\n");
|
||||
fprintf(stream, "\n");
|
||||
|
||||
fprintf(stream, "mst_eval: %.2f # ms / token during generation\n",
|
||||
1.0e-3 * ctx->t_eval_us / ctx->n_eval);
|
||||
fprintf(stream, "mst_p_eval: %.2f # ms / token during prompt processing\n",
|
||||
1.0e-3 * ctx->t_p_eval_us / ctx->n_p_eval);
|
||||
fprintf(stream, "n_eval: %d # number of tokens generated (excluding the first one)\n", ctx->n_eval);
|
||||
fprintf(stream, "n_p_eval: %d # number of tokens processed in batches at the beginning\n", ctx->n_p_eval);
|
||||
fprintf(stream, "t_eval_us: %" PRId64 " # total microseconds spent generating tokens\n", ctx->t_eval_us);
|
||||
fprintf(stream, "t_load_us: %" PRId64 " # total microseconds spent loading the model\n", ctx->t_load_us);
|
||||
fprintf(stream, "t_p_eval_us: %" PRId64 " # total microseconds spent prompt processing\n", ctx->t_p_eval_us);
|
||||
fprintf(stream, "ts_eval: %.2f # tokens / second during generation\n",
|
||||
1.0e6 * ctx->n_eval / ctx->t_eval_us);
|
||||
fprintf(stream, "ts_p_eval: %.2f # tokens / second during prompt processing\n",
|
||||
1.0e6 * ctx->n_p_eval / ctx->t_p_eval_us);
|
||||
}
|
||||
|
||||
// For internal test use
|
||||
const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
|
||||
struct llama_context * ctx
|
||||
|
@ -667,6 +667,9 @@ extern "C" {
|
||||
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
|
||||
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
|
||||
|
||||
// Check if the context supports KV cache shifting
|
||||
LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
|
||||
|
||||
//
|
||||
// State / sessions
|
||||
//
|
||||
@ -1244,8 +1247,6 @@ extern "C" {
|
||||
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
|
||||
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
|
||||
|
||||
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -19,11 +19,16 @@ if (NOT GGML_HOME)
|
||||
SOURCE_FILES
|
||||
${SOURCE_FILES}
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-backend-reg.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-threading.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c
|
||||
)
|
||||
endif()
|
||||
|
||||
|
@ -25,6 +25,11 @@
|
||||
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
|
||||
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
|
||||
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */ = {isa = PBXBuildFile; fileRef = 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */; };
|
||||
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */; };
|
||||
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */; };
|
||||
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */; };
|
||||
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */; };
|
||||
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */; };
|
||||
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
|
||||
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
|
||||
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; };
|
||||
@ -50,8 +55,8 @@
|
||||
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
|
||||
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
|
||||
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
|
||||
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
|
||||
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal.metal"; sourceTree = "<group>"; };
|
||||
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal/ggml-metal.m"; sourceTree = "<group>"; };
|
||||
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal/ggml-metal.metal"; sourceTree = "<group>"; };
|
||||
18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
|
||||
18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
|
||||
@ -77,8 +82,17 @@
|
||||
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
|
||||
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
|
||||
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
|
||||
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu.c"; sourceTree = "<group>"; };
|
||||
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
|
||||
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
|
||||
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
|
||||
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-threading.cpp"; path = "../../../ggml/src/ggml-threading.cpp"; sourceTree = "<group>"; };
|
||||
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.cpp"; sourceTree = "<group>"; };
|
||||
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-aarch64.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.h"; sourceTree = "<group>"; };
|
||||
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.c"; sourceTree = "<group>"; };
|
||||
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-impl.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-impl.h"; sourceTree = "<group>"; };
|
||||
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-quants.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.h"; sourceTree = "<group>"; };
|
||||
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-quants.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.c"; sourceTree = "<group>"; };
|
||||
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-backend-reg.cpp"; path = "../../../ggml/src/ggml-backend-reg.cpp"; sourceTree = "<group>"; };
|
||||
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
|
||||
7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
|
||||
7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
|
||||
@ -118,6 +132,15 @@
|
||||
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
|
||||
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */,
|
||||
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */,
|
||||
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */,
|
||||
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */,
|
||||
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */,
|
||||
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */,
|
||||
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */,
|
||||
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */,
|
||||
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */,
|
||||
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */,
|
||||
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
|
||||
@ -252,11 +275,16 @@
|
||||
18627C9629052C5800BD2A04 /* ggml.c in Sources */,
|
||||
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
|
||||
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
|
||||
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */,
|
||||
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */,
|
||||
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
|
||||
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */,
|
||||
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */,
|
||||
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */,
|
||||
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
|
||||
18627C8C29052BE000BD2A04 /* main.m in Sources */,
|
||||
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
|
||||
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */,
|
||||
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
|
||||
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
|
||||
);
|
||||
@ -335,6 +363,8 @@
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
HEADER_SEARCH_PATHS = ../../../ggml/src/;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
MTL_FAST_MATH = YES;
|
||||
@ -388,6 +418,8 @@
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
HEADER_SEARCH_PATHS = ../../../ggml/src/;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
MTL_FAST_MATH = YES;
|
||||
@ -400,64 +432,6 @@
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
18627C9029052BE000BD2A04 /* Debug */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_TEAM = P8JZH34X63;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_FILE = whisper.objc/Info.plist;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
|
||||
INFOPLIST_KEY_UIMainStoryboardFile = Main;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
MTL_HEADER_SEARCH_PATHS = "";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
18627C9129052BE000BD2A04 /* Release */ = {
|
||||
isa = XCBuildConfiguration;
|
||||
buildSettings = {
|
||||
ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon;
|
||||
ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor;
|
||||
CODE_SIGN_STYLE = Automatic;
|
||||
CURRENT_PROJECT_VERSION = 1;
|
||||
DEVELOPMENT_TEAM = P8JZH34X63;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
INFOPLIST_FILE = whisper.objc/Info.plist;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
|
||||
INFOPLIST_KEY_UIMainStoryboardFile = Main;
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPad = "UIInterfaceOrientationPortrait UIInterfaceOrientationPortraitUpsideDown UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
INFOPLIST_KEY_UISupportedInterfaceOrientations_iPhone = "UIInterfaceOrientationPortrait UIInterfaceOrientationLandscapeLeft UIInterfaceOrientationLandscapeRight";
|
||||
LD_RUNPATH_SEARCH_PATHS = (
|
||||
"$(inherited)",
|
||||
"@executable_path/Frameworks",
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
MTL_HEADER_SEARCH_PATHS = "";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
/* End XCBuildConfiguration section */
|
||||
|
||||
/* Begin XCConfigurationList section */
|
||||
@ -470,15 +444,6 @@
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
18627C8F29052BE000BD2A04 /* Build configuration list for PBXNativeTarget "whisper.objc" */ = {
|
||||
isa = XCConfigurationList;
|
||||
buildConfigurations = (
|
||||
18627C9029052BE000BD2A04 /* Debug */,
|
||||
18627C9129052BE000BD2A04 /* Release */,
|
||||
);
|
||||
defaultConfigurationIsVisible = 0;
|
||||
defaultConfigurationName = Release;
|
||||
};
|
||||
/* End XCConfigurationList section */
|
||||
};
|
||||
rootObject = 18627C6E29052BDF00BD2A04 /* Project object */;
|
||||
|
@ -67,8 +67,6 @@ actor WhisperContext {
|
||||
private func systemInfo() -> String {
|
||||
var info = ""
|
||||
if (ggml_cpu_has_neon() != 0) { info += "NEON " }
|
||||
if (ggml_cpu_has_metal() != 0) { info += "METAL " }
|
||||
if (ggml_cpu_has_blas() != 0) { info += "BLAS " }
|
||||
return String(info.dropLast())
|
||||
}
|
||||
|
||||
|
@ -92,6 +92,7 @@ else()
|
||||
endif()
|
||||
|
||||
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
|
||||
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
|
||||
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
|
||||
@ -116,6 +117,7 @@ endif()
|
||||
|
||||
# ggml core
|
||||
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
|
||||
option(GGML_CPU "ggml: enable CPU backend" ON)
|
||||
|
||||
# 3rd party libs / backends
|
||||
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
|
||||
@ -126,14 +128,9 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
|
||||
|
||||
option(GGML_CUDA "ggml: use CUDA" OFF)
|
||||
option(GGML_MUSA "ggml: use MUSA" OFF)
|
||||
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
|
||||
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
|
||||
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
|
||||
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
|
||||
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
|
||||
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
|
||||
"ggml: iters./thread per block for Q2_K/Q6_K")
|
||||
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"ggml: max. batch size for using peer access")
|
||||
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
|
||||
@ -141,7 +138,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
|
||||
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
|
||||
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
|
||||
|
||||
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
|
||||
option(GGML_HIP "ggml: use HIP" OFF)
|
||||
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
|
||||
option(GGML_VULKAN "ggml: use Vulkan" OFF)
|
||||
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
|
||||
@ -167,6 +164,8 @@ option(GGML_SYCL "ggml: use SYCL"
|
||||
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||
"ggml: sycl target device")
|
||||
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
||||
"ggml: sycl device architecture")
|
||||
|
||||
# extra artifacts
|
||||
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
|
||||
@ -226,6 +225,7 @@ set(GGML_PUBLIC_HEADERS
|
||||
include/ggml-cann.h
|
||||
include/ggml-cuda.h
|
||||
include/ggml-kompute.h
|
||||
include/ggml-opt.h
|
||||
include/ggml-metal.h
|
||||
include/ggml-rpc.h
|
||||
include/ggml-sycl.h
|
||||
@ -235,15 +235,14 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
|
||||
#if (GGML_METAL)
|
||||
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
|
||||
#endif()
|
||||
install(TARGETS ggml PUBLIC_HEADER)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
install(TARGETS ggml LIBRARY)
|
||||
endif()
|
||||
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
|
||||
install(TARGETS ggml-base LIBRARY)
|
||||
|
||||
# FIXME: this should be done in the backend cmake files
|
||||
if (GGML_METAL)
|
||||
# FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
|
||||
install(
|
||||
FILES src/ggml-metal.metal
|
||||
FILES src/ggml-metal/ggml-metal.metal
|
||||
PERMISSIONS
|
||||
OWNER_READ
|
||||
OWNER_WRITE
|
||||
|
@ -1,220 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import logging
|
||||
import argparse
|
||||
import asyncio
|
||||
import os
|
||||
from tempfile import gettempdir
|
||||
|
||||
logger = logging.getLogger("ggml-vk-generate-shaders")
|
||||
|
||||
GLSLC = "glslc"
|
||||
|
||||
type_names = [
|
||||
"f32",
|
||||
"f16",
|
||||
"q4_0",
|
||||
"q4_1",
|
||||
"q5_0",
|
||||
"q5_1",
|
||||
"q8_0",
|
||||
"q2_k",
|
||||
"q3_k",
|
||||
"q4_k",
|
||||
"q5_k",
|
||||
"q6_k",
|
||||
]
|
||||
|
||||
ASYNCIO_CONCURRENCY = 64
|
||||
|
||||
input_dir = "vulkan-shaders"
|
||||
output_dir = gettempdir()
|
||||
|
||||
lock = asyncio.Lock()
|
||||
shader_fnames = []
|
||||
|
||||
|
||||
async def string_to_spv(name, in_fname, defines, fp16=True):
|
||||
name = f"{name}{'_fp32' if not fp16 else ''}"
|
||||
out_fname = os.path.join(output_dir, f"{name}.spv")
|
||||
|
||||
in_path = os.path.join(input_dir, in_fname)
|
||||
|
||||
cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname]
|
||||
|
||||
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
|
||||
|
||||
proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
|
||||
|
||||
stdout, stderr = await proc.communicate()
|
||||
|
||||
stdout = stdout.decode()
|
||||
error = stderr.decode()
|
||||
|
||||
if proc.returncode:
|
||||
cmd = " ".join(cmd)
|
||||
logger.error(f"cannot compile {name}\n\n{cmd}\n\n{error}")
|
||||
return
|
||||
|
||||
async with lock:
|
||||
shader_fnames.append((name, out_fname))
|
||||
|
||||
|
||||
def matmul_shaders(tasks, fp16, matmul_id):
|
||||
if fp16:
|
||||
load_vec = "8"
|
||||
aligned_b_type_f32 = "mat2x4"
|
||||
aligned_b_type_f16 = "f16mat2x4"
|
||||
else:
|
||||
load_vec = "4"
|
||||
aligned_b_type_f32 = "vec4"
|
||||
aligned_b_type_f16 = "f16vec4"
|
||||
|
||||
base_dict = {"FLOAT_TYPE": "float" if not fp16 else "float16_t"}
|
||||
shader_name = "matmul"
|
||||
|
||||
if matmul_id:
|
||||
base_dict["MUL_MAT_ID"] = "1"
|
||||
shader_name = "matmul_id"
|
||||
|
||||
if fp16:
|
||||
base_dict["FLOAT16"] = "1"
|
||||
|
||||
# Shaders with f16 B_TYPE
|
||||
tasks.append(string_to_spv(f"{shader_name}_f32_f16", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv(f"{shader_name}_f32_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F32": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
|
||||
|
||||
tasks.append(string_to_spv(f"{shader_name}_f16", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "B_TYPE": "float16_t", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv(f"{shader_name}_f16_aligned", "mul_mm.comp", base_dict | {"DATA_A_F16": "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"}, fp16))
|
||||
|
||||
for tname in type_names:
|
||||
data_a_key = f"DATA_A_{tname.upper()}"
|
||||
load_vec_a = load_vec if tname in ("f32", "f16") else "2"
|
||||
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32", "mul_mm.comp", base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}, fp16))
|
||||
tasks.append(string_to_spv(f"{shader_name}_{tname}_f32_aligned", "mul_mm.comp", base_dict | {data_a_key: "2", "LOAD_VEC_A": load_vec_a, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f32, "D_TYPE": "float"}, fp16))
|
||||
|
||||
|
||||
async def main():
|
||||
logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
|
||||
|
||||
tasks = []
|
||||
|
||||
for fp16 in (False, True):
|
||||
# MUL_MAT
|
||||
matmul_shaders(tasks, fp16, False)
|
||||
# MUL_MAT_ID
|
||||
matmul_shaders(tasks, fp16, True)
|
||||
|
||||
for tname in type_names:
|
||||
base_dict = {"FLOAT_TYPE": "float"}
|
||||
|
||||
# mul mat vec
|
||||
data_a_key = f"DATA_A_{tname.upper()}"
|
||||
shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"
|
||||
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f32_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f16_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float16_t", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv(f"mul_mat_vec_id_{tname}_f32", shader, base_dict | {"MUL_MAT_ID": "1", data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
# Dequant shaders
|
||||
if tname != "f16":
|
||||
tasks.append(string_to_spv(f"dequant_{tname}", f"dequant_{tname}.comp", base_dict | {data_a_key: "1", "D_TYPE": "float16_t"}))
|
||||
|
||||
# get_rows
|
||||
if not tname.endswith("_k"):
|
||||
shader = "get_rows.comp" if tname in ("f32", "f16") else "get_rows_quant.comp"
|
||||
|
||||
if tname == "f16":
|
||||
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
|
||||
else:
|
||||
tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv(f"get_rows_{tname}_f32", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
# Norms
|
||||
tasks.append(string_to_spv("norm_f32", "norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rms_norm_f32", "rms_norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("cpy_f32_f32", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("cpy_f32_f16", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
|
||||
tasks.append(string_to_spv("cpy_f16_f16", "copy.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
|
||||
|
||||
tasks.append(string_to_spv("add_f32", "add.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}))
|
||||
|
||||
tasks.append(string_to_spv("mul_f32", "mul.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("div_f32", "div.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("scale_f32", "scale.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("sqr_f32", "square.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("clamp_f32", "clamp.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("gelu_f32", "gelu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("silu_f32", "silu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("relu_f32", "relu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("soft_max_f32", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("soft_max_f32_f16", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("rope_norm_f32", "rope_norm.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_norm_f16", "rope_norm.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("rope_neox_f32", "rope_neox.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
tasks.append(string_to_spv("rope_neox_f16", "rope_neox.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))
|
||||
|
||||
tasks.append(string_to_spv("argsort_f32", "argsort.comp", {"A_TYPE": "float"}))
|
||||
|
||||
tasks.append(string_to_spv("sum_rows_f32", "sum_rows.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
|
||||
|
||||
# Helper to decorate tasks with semaphore acquisition.
|
||||
async def withSemaphore(sem, task):
|
||||
async with sem:
|
||||
return await task
|
||||
|
||||
# Run tasks concurrently guarded by a concurrency limit.
|
||||
sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
|
||||
await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))
|
||||
|
||||
with open("ggml-vulkan-shaders.hpp", "w") as f:
|
||||
f.write("#include <cstdint>\n\n")
|
||||
for name, path in sorted(shader_fnames):
|
||||
|
||||
with open(path, "rb") as spv:
|
||||
counter = 0
|
||||
newline_counter = 0
|
||||
f.write(f"unsigned char {name}_data[] = {{\n")
|
||||
for val in spv.read():
|
||||
f.write(f"0x{val:02x},")
|
||||
newline_counter += 1
|
||||
counter += 1
|
||||
if newline_counter >= 12:
|
||||
newline_counter = 0
|
||||
f.write("\n")
|
||||
f.write("\n};\n")
|
||||
f.write(f"const uint64_t {name}_len = {counter};\n\n")
|
||||
os.remove(path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
|
||||
|
||||
parser.add_argument("--glslc", help="Path to glslc")
|
||||
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
|
||||
|
||||
if args.glslc:
|
||||
GLSLC = args.glslc
|
||||
|
||||
asyncio.run(main())
|
@ -9,16 +9,16 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
// buffer_type API
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_amx(ggml_backend_t backend);
|
||||
GGML_BACKEND_API bool ggml_backend_is_amx(ggml_backend_t backend);
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_amx_init(void);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_amx_init(void);
|
||||
|
||||
GGML_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
|
||||
GGML_BACKEND_API void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_amx_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_amx_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -3,6 +3,20 @@
|
||||
#include "ggml.h"
|
||||
#include "ggml-alloc.h"
|
||||
|
||||
#ifdef GGML_BACKEND_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef GGML_BACKEND_BUILD
|
||||
# define GGML_BACKEND_API __declspec(dllexport) extern
|
||||
# else
|
||||
# define GGML_BACKEND_API __declspec(dllimport) extern
|
||||
# endif
|
||||
# else
|
||||
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
|
||||
# endif
|
||||
#else
|
||||
# define GGML_BACKEND_API extern
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -72,7 +86,7 @@ extern "C" {
|
||||
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
|
||||
// "offset" refers to the offset of the tensor data for setting/getting data
|
||||
// "offset" refers to the offset in tensor->data for setting/getting data
|
||||
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
|
||||
@ -228,14 +242,20 @@ extern "C" {
|
||||
ggml_backend_sched_reserve(sched, reserve_graph);
|
||||
|
||||
// compute
|
||||
graph = build_graph(sched);
|
||||
ggml_backend_sched_graph_compute(sched, graph);
|
||||
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
|
||||
for (int i = 0; i < 10; ++i) {
|
||||
ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
|
||||
}
|
||||
|
||||
// if there are graph inputs:
|
||||
ggml_backend_sched_reset(sched);
|
||||
ggml_backend_sched_alloc_graph(sched, graph);
|
||||
ggml_backend_tensor_set(input_tensor, ...);
|
||||
ggml_backend_sched_graph_compute(sched, graph);
|
||||
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
|
||||
ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
|
||||
ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
|
||||
ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
|
||||
ggml_backend_sched_graph_compute(sched, graph); // execute the graph
|
||||
|
||||
// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
|
||||
// allocate them statically via ggml_backend_alloc_ctx_tensors
|
||||
}
|
||||
*/
|
||||
|
||||
@ -250,7 +270,7 @@ extern "C" {
|
||||
//
|
||||
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
|
||||
|
||||
// Initialize a backend scheduler
|
||||
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
|
||||
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
|
||||
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
|
||||
|
||||
@ -275,7 +295,9 @@ extern "C" {
|
||||
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
|
||||
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
|
||||
|
||||
// Reset all assignments and allocators - must be called before changing the node backends
|
||||
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
|
||||
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
|
||||
// The correct way to use this API is to discard the deallocated tensors and create new ones.
|
||||
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
|
||||
|
||||
// Set a callback to be called for each resulting node during graph compute
|
||||
|
@ -9,15 +9,15 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_blas_init(void);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
||||
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
|
||||
|
||||
// number of threads used for conversion to float
|
||||
// for openblas and blis, this will also set the number of threads used for blas operations
|
||||
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
||||
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
@ -34,7 +34,7 @@ extern "C" {
|
||||
*/
|
||||
#define GGML_CANN_MAX_DEVICES 16
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
||||
|
||||
/**
|
||||
* @brief Initializes the CANN backend for a specified device.
|
||||
@ -46,7 +46,7 @@ GGML_API ggml_backend_reg_t ggml_backend_cann_reg(void);
|
||||
* @param device The index of the device to initialize.
|
||||
* @return A pointer to the initialized backend instance, or nullptr on failure.
|
||||
*/
|
||||
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||
|
||||
/**
|
||||
* @brief Checks if a given backend is a CANN backend.
|
||||
@ -57,7 +57,7 @@ GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
|
||||
* @param backend The backend instance to check.
|
||||
* @return True if the backend is a CANN backend, false otherwise.
|
||||
*/
|
||||
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||
GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the CANN buffer type for a specified device.
|
||||
@ -69,7 +69,7 @@ GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
|
||||
* @return A pointer to the buffer type interface for the specified device, or
|
||||
* nullptr if the device index is out of range.
|
||||
*/
|
||||
GGML_API ggml_backend_buffer_type_t
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t
|
||||
ggml_backend_cann_buffer_type(int32_t device);
|
||||
|
||||
/**
|
||||
@ -80,14 +80,14 @@ ggml_backend_cann_buffer_type(int32_t device);
|
||||
*
|
||||
* @return The number of CANN devices available.
|
||||
*/
|
||||
GGML_API int32_t ggml_backend_cann_get_device_count(void);
|
||||
GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
|
||||
|
||||
/**
|
||||
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
|
||||
*
|
||||
* @return A pointer to the host buffer type interface.
|
||||
*/
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||
|
||||
/**
|
||||
* @brief Retrieves the description of a specific CANN device.
|
||||
@ -99,7 +99,7 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
|
||||
* @param description Pointer to a buffer where the description will be written.
|
||||
* @param description_size Size of the description buffer.
|
||||
*/
|
||||
GGML_API void ggml_backend_cann_get_device_description(
|
||||
GGML_BACKEND_API void ggml_backend_cann_get_device_description(
|
||||
int32_t device, char* description, size_t description_size);
|
||||
|
||||
/**
|
||||
@ -114,7 +114,7 @@ GGML_API void ggml_backend_cann_get_device_description(
|
||||
* @param total Pointer to a variable where the total memory size will be
|
||||
* stored.
|
||||
*/
|
||||
GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
|
||||
GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
|
||||
size_t* free,
|
||||
size_t* total);
|
||||
|
||||
|
@ -54,54 +54,77 @@ extern "C" {
|
||||
GGML_NUMA_STRATEGY_COUNT
|
||||
};
|
||||
|
||||
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
|
||||
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
|
||||
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
|
||||
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
|
||||
|
||||
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
|
||||
|
||||
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
|
||||
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
|
||||
|
||||
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
|
||||
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
|
||||
|
||||
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
|
||||
GGML_BACKEND_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
|
||||
GGML_BACKEND_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
|
||||
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
|
||||
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
|
||||
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
|
||||
|
||||
// ggml_graph_plan() has to be called before ggml_graph_compute()
|
||||
// when plan.work_size > 0, caller must allocate memory for plan.work_data
|
||||
GGML_API struct ggml_cplan ggml_graph_plan(
|
||||
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
|
||||
const struct ggml_cgraph * cgraph,
|
||||
int n_threads, /* = GGML_DEFAULT_N_THREADS */
|
||||
struct ggml_threadpool * threadpool /* = NULL */ );
|
||||
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
|
||||
|
||||
// same as ggml_graph_compute() but the work data is allocated as a part of the context
|
||||
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
|
||||
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
|
||||
|
||||
// TODO: move to backend interface
|
||||
GGML_API int ggml_cpu_has_neon (void);
|
||||
GGML_API int ggml_cpu_has_sve (void);
|
||||
GGML_API int ggml_cpu_has_matmul_int8(void);
|
||||
// get the sve vector length in bytes
|
||||
GGML_API int ggml_cpu_get_sve_cnt(void);
|
||||
//
|
||||
// system info
|
||||
//
|
||||
|
||||
// x86
|
||||
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_fma (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
|
||||
// ARM
|
||||
GGML_BACKEND_API int ggml_cpu_has_neon (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_sve (void);
|
||||
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
|
||||
// other
|
||||
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
|
||||
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
|
||||
|
||||
// Internal types and functions exposed for tests and benchmarks
|
||||
|
||||
@ -115,6 +138,7 @@ extern "C" {
|
||||
const void * GGML_RESTRICT y, int nr, int nc);
|
||||
|
||||
struct ggml_type_traits_cpu {
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_to_mat_t from_float_to_mat;
|
||||
ggml_vec_dot_t vec_dot;
|
||||
enum ggml_type vec_dot_type;
|
||||
@ -124,27 +148,30 @@ extern "C" {
|
||||
ggml_gemm_t gemm;
|
||||
};
|
||||
|
||||
GGML_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
|
||||
|
||||
GGML_API void ggml_cpu_init(void);
|
||||
GGML_BACKEND_API void ggml_cpu_init(void);
|
||||
|
||||
//
|
||||
// CPU backend
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
|
||||
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
|
||||
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
|
||||
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||
#endif
|
||||
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
|
||||
GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -7,7 +7,7 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_HIPBLAS
|
||||
#ifdef GGML_USE_HIP
|
||||
#define GGML_CUDA_NAME "ROCm"
|
||||
#define GGML_CUBLAS_NAME "hipBLAS"
|
||||
#elif defined(GGML_USE_MUSA)
|
||||
@ -20,27 +20,27 @@ extern "C" {
|
||||
#define GGML_CUDA_MAX_DEVICES 16
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
|
||||
|
||||
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
|
||||
|
||||
// device buffer
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
|
||||
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
|
||||
|
||||
GGML_API int ggml_backend_cuda_get_device_count(void);
|
||||
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||
GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
|
||||
GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
|
||||
|
||||
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||
GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
|
||||
GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -37,13 +37,13 @@ struct ggml_vk_device ggml_vk_current_device(void);
|
||||
// forward declaration
|
||||
typedef struct ggml_backend * ggml_backend_t;
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
|
||||
|
||||
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -39,27 +39,27 @@ extern "C" {
|
||||
// user-code should use only these functions
|
||||
//
|
||||
|
||||
GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
|
||||
|
||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||
|
||||
GGML_DEPRECATED(
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
||||
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
|
||||
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
|
||||
|
||||
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
||||
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||
|
||||
// helper to check if the device supports a specific family
|
||||
// ideally, the user code should be doing these checks
|
||||
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
|
||||
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
||||
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
|
||||
|
||||
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
|
||||
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_metal_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
216
ggml/include/ggml-opt.h
Normal file
216
ggml/include/ggml-opt.h
Normal file
@ -0,0 +1,216 @@
|
||||
// This file contains functionality for training models using GGML.
|
||||
// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
|
||||
// At the bottom of this file especially there are relatively high-level functions that are suitable use or adaptation in user code.
|
||||
//
|
||||
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-backend.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
struct ggml_opt_dataset;
|
||||
struct ggml_opt_context;
|
||||
struct ggml_opt_result;
|
||||
|
||||
typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
|
||||
typedef struct ggml_opt_context * ggml_opt_context_t;
|
||||
typedef struct ggml_opt_result * ggml_opt_result_t;
|
||||
|
||||
// ====== Loss ======
|
||||
|
||||
// built-in loss types, i.e. the built-in quantities minimized by the optimizer
|
||||
// custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
|
||||
enum ggml_opt_loss_type {
|
||||
GGML_OPT_LOSS_TYPE_MEAN,
|
||||
GGML_OPT_LOSS_TYPE_SUM,
|
||||
GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
|
||||
GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
|
||||
};
|
||||
|
||||
// ====== Dataset ======
|
||||
|
||||
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
|
||||
int64_t ne_datapoint, // number of elements per datapoint
|
||||
int64_t ne_label, // number of elements per label
|
||||
int64_t ndata, // total number of datapoints/labels
|
||||
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
|
||||
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
|
||||
|
||||
// get underlying tensors that store the data
|
||||
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
|
||||
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [nd_label, ndata]
|
||||
|
||||
// shuffle idata first datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
|
||||
GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
|
||||
|
||||
// get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
|
||||
GGML_API void ggml_opt_dataset_get_batch(
|
||||
ggml_opt_dataset_t dataset,
|
||||
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
|
||||
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
|
||||
int64_t ibatch);
|
||||
|
||||
// ====== Model / Context ======
|
||||
|
||||
enum ggml_opt_build_type {
|
||||
GGML_OPT_BUILD_TYPE_FORWARD,
|
||||
GGML_OPT_BUILD_TYPE_GRAD,
|
||||
GGML_OPT_BUILD_TYPE_OPT,
|
||||
};
|
||||
|
||||
// parameters that control which optimizer is used and how said optimizer tries to find the minimal loss
|
||||
struct ggml_opt_optimizer_params {
|
||||
// AdamW optimizer parameters
|
||||
struct {
|
||||
float alpha; // learning rate
|
||||
float beta1;
|
||||
float beta2;
|
||||
float eps; // epsilon for numerical stability
|
||||
float wd; // weight decay for AdamW, use 0.0f to disable
|
||||
} adamw;
|
||||
};
|
||||
|
||||
// callback to calculate optimizer parameters prior to a backward pass
|
||||
// userdata can be used to pass arbitrary data
|
||||
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
|
||||
|
||||
// returns the default optimizer params (constant)
|
||||
// userdata is not used
|
||||
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
|
||||
|
||||
// parameters for initializing a new optimization context
|
||||
struct ggml_opt_params {
|
||||
ggml_backend_sched_t backend_sched; // defines which backends are used to construct the compute graphs
|
||||
|
||||
struct ggml_context * ctx_compute; // created in user code, holds non-static tensors
|
||||
|
||||
// the forward graph is defined by inputs and outputs
|
||||
// those tensors and all tensors inbetween are not intended to be reusable between multiple optimization contexts
|
||||
struct ggml_tensor * inputs;
|
||||
struct ggml_tensor * outputs;
|
||||
|
||||
enum ggml_opt_loss_type loss_type;
|
||||
enum ggml_opt_build_type build_type;
|
||||
|
||||
int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done
|
||||
|
||||
ggml_opt_get_optimizer_params get_opt_pars; // callback for calculating optimizer parameters
|
||||
void * get_opt_pars_ud; // userdata for calculating optimizer parameters
|
||||
};
|
||||
|
||||
// get parameters for an optimization context with defaults set where possible
|
||||
// parameters for which no sensible defaults exist are supplied as arguments to this function
|
||||
GGML_API ggml_opt_params ggml_opt_default_params(
|
||||
ggml_backend_sched_t backend_sched,
|
||||
struct ggml_context * ctx_compute,
|
||||
struct ggml_tensor * inputs,
|
||||
struct ggml_tensor * outputs,
|
||||
enum ggml_opt_loss_type loss_type);
|
||||
|
||||
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
|
||||
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
|
||||
|
||||
// set gradients to zero, initilize loss, and optionally reset the optimizer
|
||||
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
|
||||
|
||||
// get underlying tensors that store data
|
||||
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
|
||||
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
|
||||
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
|
||||
GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
|
||||
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
|
||||
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
|
||||
|
||||
// ====== Optimization Result ======
|
||||
|
||||
GGML_API ggml_opt_result_t ggml_opt_result_init();
|
||||
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
|
||||
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
|
||||
|
||||
// get data from result, uncertainties are optional and can be ignored by passing NULL
|
||||
GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints
|
||||
GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value
|
||||
GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values
|
||||
GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value
|
||||
|
||||
// ====== Computation ======
|
||||
|
||||
// do forward pass, increment result if not NULL
|
||||
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
|
||||
|
||||
// do forward pass, increment result if not NULL, do backward pass
|
||||
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
|
||||
|
||||
// ############################################################################
|
||||
// ## The high-level functions start here. They do not depend on any private ##
|
||||
// ## functions or structs and can be copied to and adapted for user code. ##
|
||||
// ############################################################################
|
||||
|
||||
// ====== Intended Usage ======
|
||||
//
|
||||
// 1. Select the appropriate loss for your problem.
|
||||
// 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
|
||||
// Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
|
||||
// 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
|
||||
// The first context should contain the model parameters and inputs and be allocated statically in user code.
|
||||
// The second context should contain all other tensors and will be (re)allocated automatically.
|
||||
// Due to this automated allocation the data of the second context is not defined when accessed in user code.
|
||||
// Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
|
||||
// 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
|
||||
|
||||
// signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
|
||||
typedef void (*ggml_opt_epoch_callback)(
|
||||
bool train, // true after training evaluation, false after validation evaluation
|
||||
ggml_opt_context_t opt_ctx,
|
||||
ggml_opt_dataset_t dataset,
|
||||
ggml_opt_result_t result, // result associated with the dataset subsection
|
||||
int64_t ibatch, // number of batches that have been evaluated so far
|
||||
int64_t ibatch_max, // total number of batches in this dataset subsection
|
||||
int64_t t_start_us); // time at which the evaluation on the dataset subsection was started
|
||||
|
||||
// do training on front of dataset, do evaluation only on back of dataset
|
||||
GGML_API void ggml_opt_epoch(
|
||||
ggml_opt_context_t opt_ctx,
|
||||
ggml_opt_dataset_t dataset,
|
||||
ggml_opt_result_t result_train, // result to increment during training, ignored if NULL
|
||||
ggml_opt_result_t result_eval, // result to increment during evaluation, ignored if NULL
|
||||
int64_t idata_split, // data index at which to split training and evaluation
|
||||
ggml_opt_epoch_callback callback_train,
|
||||
ggml_opt_epoch_callback callback_eval);
|
||||
|
||||
// callback that prints a progress bar on stderr
|
||||
GGML_API void ggml_opt_epoch_callback_progress_bar(
|
||||
bool train,
|
||||
ggml_opt_context_t opt_ctx,
|
||||
ggml_opt_dataset_t dataset,
|
||||
ggml_opt_result_t result,
|
||||
int64_t ibatch,
|
||||
int64_t ibatch_max,
|
||||
int64_t t_start_us);
|
||||
|
||||
// fit model defined by inputs and outputs to dataset
|
||||
GGML_API void ggml_opt_fit(
|
||||
ggml_backend_sched_t backend_sched, // backend scheduler for constructing the compute graphs
|
||||
ggml_context * ctx_compute, // context with temporarily allocated tensors to calculate the outputs
|
||||
ggml_tensor * inputs, // input tensor with shape [ne_datapoint, ndata_batch]
|
||||
ggml_tensor * outputs, // output tensor, must have shape [ne_label, ndata_batch] if labels are used
|
||||
ggml_opt_dataset_t dataset, // dataset with data and optionally also labels
|
||||
enum ggml_opt_loss_type loss_type, // loss to minimize
|
||||
ggml_opt_get_optimizer_params get_opt_pars, // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
|
||||
int64_t nepoch, // how many times the dataset should be iterated over
|
||||
int64_t nbatch_logical, // datapoints optimizer step, must be a multiple of ndata_batch in inputs/outputs
|
||||
float val_split, // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
|
||||
bool silent); // whether or not info prints to stderr should be suppressed
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -10,18 +10,18 @@ extern "C" {
|
||||
#define GGML_RPC_MAX_SERVERS 16
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
|
||||
GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
|
||||
|
||||
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
|
||||
GGML_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
||||
|
||||
GGML_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
|
||||
GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -17,32 +17,32 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
|
||||
|
||||
GGML_API bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||
GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
|
||||
|
||||
// devide buffer
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
|
||||
|
||||
// split tensor buffer that splits matrices by rows across multiple devices
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
|
||||
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
|
||||
|
||||
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||
GGML_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
|
||||
GGML_API void ggml_backend_sycl_get_device_description(int device,
|
||||
GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
|
||||
GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
|
||||
GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
|
||||
char *description,
|
||||
size_t description_size);
|
||||
GGML_API int ggml_backend_sycl_get_device_count();
|
||||
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||
GGML_BACKEND_API int ggml_backend_sycl_get_device_count();
|
||||
GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
|
||||
|
||||
// SYCL doesn't support registering host memory, keep here for reference
|
||||
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
|
||||
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
|
||||
// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
|
||||
// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -10,21 +10,21 @@ extern "C" {
|
||||
#define GGML_VK_NAME "Vulkan"
|
||||
#define GGML_VK_MAX_DEVICES 16
|
||||
|
||||
GGML_API void ggml_vk_instance_init(void);
|
||||
GGML_BACKEND_API void ggml_vk_instance_init(void);
|
||||
|
||||
// backend API
|
||||
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
||||
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
|
||||
|
||||
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
|
||||
GGML_API int ggml_backend_vk_get_device_count(void);
|
||||
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
|
||||
GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
|
||||
GGML_BACKEND_API int ggml_backend_vk_get_device_count(void);
|
||||
GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
|
||||
GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
|
||||
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
|
||||
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
|
||||
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
|
||||
|
||||
GGML_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
@ -176,15 +176,15 @@
|
||||
#ifdef GGML_SHARED
|
||||
# if defined(_WIN32) && !defined(__MINGW32__)
|
||||
# ifdef GGML_BUILD
|
||||
# define GGML_API __declspec(dllexport)
|
||||
# define GGML_API __declspec(dllexport) extern
|
||||
# else
|
||||
# define GGML_API __declspec(dllimport)
|
||||
# define GGML_API __declspec(dllimport) extern
|
||||
# endif
|
||||
# else
|
||||
# define GGML_API __attribute__ ((visibility ("default")))
|
||||
# define GGML_API __attribute__ ((visibility ("default"))) extern
|
||||
# endif
|
||||
#else
|
||||
# define GGML_API
|
||||
# define GGML_API extern
|
||||
#endif
|
||||
|
||||
// TODO: support for clang
|
||||
@ -602,7 +602,6 @@ extern "C" {
|
||||
|
||||
int32_t flags;
|
||||
|
||||
struct ggml_tensor * grad;
|
||||
struct ggml_tensor * src[GGML_MAX_SRC];
|
||||
|
||||
// source tensor and offset for views
|
||||
@ -615,7 +614,7 @@ extern "C" {
|
||||
|
||||
void * extra; // extra things e.g. for ggml-cuda.cu
|
||||
|
||||
// char padding[4];
|
||||
char padding[8];
|
||||
};
|
||||
|
||||
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
|
||||
@ -1490,7 +1489,7 @@ extern "C" {
|
||||
"use ggml_rope_ext_inplace instead");
|
||||
|
||||
// compute correction dims for YaRN RoPE scaling
|
||||
void ggml_rope_yarn_corr_dims(
|
||||
GGML_API void ggml_rope_yarn_corr_dims(
|
||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
|
||||
|
||||
// rotary position embedding backward, i.e compute dx from dy
|
||||
@ -1985,28 +1984,20 @@ extern "C" {
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * grad,
|
||||
float alpha,
|
||||
float beta1,
|
||||
float beta2,
|
||||
float eps,
|
||||
float wd); // weight decay
|
||||
struct ggml_tensor * m,
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * adamw_params); // parameters such a the learning rate
|
||||
|
||||
//
|
||||
// automatic differentiation
|
||||
//
|
||||
|
||||
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
|
||||
|
||||
GGML_API void ggml_build_opt_adamw(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb,
|
||||
float alpha,
|
||||
float beta1,
|
||||
float beta2,
|
||||
float eps,
|
||||
float wd); // weight decay
|
||||
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
||||
GGML_API void ggml_build_backward_expand(
|
||||
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
|
||||
struct ggml_context * ctx_compute, // context for gradient computation
|
||||
struct ggml_cgraph * cgraph,
|
||||
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
|
||||
|
||||
// graph allocation in a context
|
||||
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
|
||||
@ -2026,7 +2017,9 @@ extern "C" {
|
||||
GGML_API size_t ggml_graph_overhead(void);
|
||||
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
|
||||
GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
|
||||
|
||||
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
|
||||
GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
|
||||
@ -2037,198 +2030,15 @@ extern "C" {
|
||||
// dump the graph into a file using the dot format
|
||||
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
|
||||
|
||||
// build gradient checkpointing backward graph gb for gf using provided checkpoints
|
||||
// gb_tmp will contain original backward graph with rewritten backward process nodes,
|
||||
// but without the second forward pass nodes.
|
||||
GGML_API void ggml_build_backward_gradient_checkpointing(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb,
|
||||
struct ggml_cgraph * gb_tmp,
|
||||
struct ggml_tensor * * checkpoints,
|
||||
int n_checkpoints);
|
||||
//
|
||||
// optimization
|
||||
//
|
||||
|
||||
// optimization methods
|
||||
enum ggml_opt_type {
|
||||
GGML_OPT_TYPE_ADAM,
|
||||
GGML_OPT_TYPE_LBFGS,
|
||||
};
|
||||
|
||||
// linesearch methods
|
||||
enum ggml_linesearch {
|
||||
GGML_LINESEARCH_DEFAULT = 1,
|
||||
|
||||
GGML_LINESEARCH_BACKTRACKING_ARMIJO = 0,
|
||||
GGML_LINESEARCH_BACKTRACKING_WOLFE = 1,
|
||||
GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
|
||||
};
|
||||
|
||||
// optimization return values
|
||||
enum ggml_opt_result {
|
||||
GGML_OPT_RESULT_OK = 0,
|
||||
GGML_OPT_RESULT_DID_NOT_CONVERGE,
|
||||
GGML_OPT_RESULT_NO_CONTEXT,
|
||||
GGML_OPT_RESULT_INVALID_WOLFE,
|
||||
GGML_OPT_RESULT_FAIL,
|
||||
GGML_OPT_RESULT_CANCEL,
|
||||
|
||||
GGML_LINESEARCH_FAIL = -128,
|
||||
GGML_LINESEARCH_MINIMUM_STEP,
|
||||
GGML_LINESEARCH_MAXIMUM_STEP,
|
||||
GGML_LINESEARCH_MAXIMUM_ITERATIONS,
|
||||
GGML_LINESEARCH_INVALID_PARAMETERS,
|
||||
};
|
||||
|
||||
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
|
||||
// TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
|
||||
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
|
||||
|
||||
// Set callback for all future logging events.
|
||||
// If this is not called, or NULL is supplied, everything is output on stderr.
|
||||
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
|
||||
|
||||
// optimization parameters
|
||||
//
|
||||
// see ggml.c (ggml_opt_default_params) for default values
|
||||
//
|
||||
struct ggml_opt_params {
|
||||
enum ggml_opt_type type;
|
||||
|
||||
size_t graph_size;
|
||||
|
||||
int n_threads;
|
||||
|
||||
// delta-based convergence test
|
||||
//
|
||||
// if past == 0 - disabled
|
||||
// if past > 0:
|
||||
// stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
|
||||
//
|
||||
int past;
|
||||
float delta;
|
||||
|
||||
// maximum number of iterations without improvement
|
||||
//
|
||||
// if 0 - disabled
|
||||
// if > 0:
|
||||
// assume convergence if no cost improvement in this number of iterations
|
||||
//
|
||||
int max_no_improvement;
|
||||
|
||||
bool print_forward_graph;
|
||||
bool print_backward_graph;
|
||||
|
||||
int n_gradient_accumulation;
|
||||
|
||||
// ADAM parameters
|
||||
struct {
|
||||
int n_iter;
|
||||
|
||||
float sched; // schedule multiplier (fixed, decay or warmup)
|
||||
float decay; // weight decay for AdamW, use 0.0f to disable
|
||||
int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
|
||||
float alpha; // learning rate
|
||||
float beta1;
|
||||
float beta2;
|
||||
float eps; // epsilon for numerical stability
|
||||
float eps_f; // epsilon for convergence test
|
||||
float eps_g; // epsilon for convergence test
|
||||
float gclip; // gradient clipping
|
||||
} adam;
|
||||
|
||||
// LBFGS parameters
|
||||
struct {
|
||||
int m; // number of corrections to approximate the inv. Hessian
|
||||
int n_iter;
|
||||
int max_linesearch;
|
||||
|
||||
float eps; // convergence tolerance
|
||||
float ftol; // line search tolerance
|
||||
float wolfe;
|
||||
float min_step;
|
||||
float max_step;
|
||||
|
||||
enum ggml_linesearch linesearch;
|
||||
} lbfgs;
|
||||
};
|
||||
|
||||
struct ggml_opt_context {
|
||||
struct ggml_context * ctx;
|
||||
struct ggml_opt_params params;
|
||||
|
||||
int iter;
|
||||
int64_t nx; // number of parameter elements
|
||||
|
||||
bool just_initialized;
|
||||
|
||||
float loss_before;
|
||||
float loss_after;
|
||||
|
||||
struct {
|
||||
struct ggml_tensor * g; // current gradient
|
||||
struct ggml_tensor * m; // first moment
|
||||
struct ggml_tensor * v; // second moment
|
||||
struct ggml_tensor * pf; // past function values
|
||||
float fx_best;
|
||||
float fx_prev;
|
||||
int n_no_improvement;
|
||||
} adam;
|
||||
|
||||
struct {
|
||||
struct ggml_tensor * x; // current parameters
|
||||
struct ggml_tensor * xp; // previous parameters
|
||||
struct ggml_tensor * g; // current gradient
|
||||
struct ggml_tensor * gp; // previous gradient
|
||||
struct ggml_tensor * d; // search direction
|
||||
struct ggml_tensor * pf; // past function values
|
||||
struct ggml_tensor * lmal; // the L-BFGS memory alpha
|
||||
struct ggml_tensor * lmys; // the L-BFGS memory ys
|
||||
struct ggml_tensor * lms; // the L-BFGS memory s
|
||||
struct ggml_tensor * lmy; // the L-BFGS memory y
|
||||
float fx_best;
|
||||
float step;
|
||||
int j;
|
||||
int k;
|
||||
int end;
|
||||
int n_no_improvement;
|
||||
} lbfgs;
|
||||
};
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
|
||||
|
||||
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
|
||||
|
||||
// optimize the function defined by the tensor f
|
||||
GGML_API enum ggml_opt_result ggml_opt(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_params params,
|
||||
struct ggml_tensor * f);
|
||||
|
||||
// initialize optimizer context
|
||||
GGML_API void ggml_opt_init(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_context * opt,
|
||||
struct ggml_opt_params params,
|
||||
int64_t nx);
|
||||
|
||||
// continue optimizing the function defined by the tensor f
|
||||
GGML_API enum ggml_opt_result ggml_opt_resume(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_context * opt,
|
||||
struct ggml_tensor * f);
|
||||
|
||||
// continue optimizing the function defined by the tensor f
|
||||
GGML_API enum ggml_opt_result ggml_opt_resume_g(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_opt_context * opt,
|
||||
struct ggml_tensor * f,
|
||||
struct ggml_cgraph * gf,
|
||||
struct ggml_cgraph * gb,
|
||||
ggml_opt_callback callback,
|
||||
void * callback_data);
|
||||
|
||||
//
|
||||
// quantization
|
||||
//
|
||||
@ -2384,38 +2194,6 @@ extern "C" {
|
||||
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
|
||||
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
|
||||
|
||||
//
|
||||
// system info
|
||||
//
|
||||
|
||||
GGML_API int ggml_cpu_has_avx (void);
|
||||
GGML_API int ggml_cpu_has_avx_vnni (void);
|
||||
GGML_API int ggml_cpu_has_avx2 (void);
|
||||
GGML_API int ggml_cpu_has_avx512 (void);
|
||||
GGML_API int ggml_cpu_has_avx512_vbmi(void);
|
||||
GGML_API int ggml_cpu_has_avx512_vnni(void);
|
||||
GGML_API int ggml_cpu_has_avx512_bf16(void);
|
||||
GGML_API int ggml_cpu_has_amx_int8 (void);
|
||||
GGML_API int ggml_cpu_has_fma (void);
|
||||
GGML_API int ggml_cpu_has_arm_fma (void);
|
||||
GGML_API int ggml_cpu_has_metal (void);
|
||||
GGML_API int ggml_cpu_has_f16c (void);
|
||||
GGML_API int ggml_cpu_has_fp16_va (void);
|
||||
GGML_API int ggml_cpu_has_wasm_simd (void);
|
||||
GGML_API int ggml_cpu_has_blas (void);
|
||||
GGML_API int ggml_cpu_has_cuda (void);
|
||||
GGML_API int ggml_cpu_has_vulkan (void);
|
||||
GGML_API int ggml_cpu_has_kompute (void);
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_API int ggml_cpu_has_riscv_v (void);
|
||||
GGML_API int ggml_cpu_has_sycl (void);
|
||||
GGML_API int ggml_cpu_has_rpc (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
GGML_API int ggml_cpu_has_cann (void);
|
||||
GGML_API int ggml_cpu_has_llamafile (void);
|
||||
|
||||
#ifdef __cplusplus
|
||||
// restrict not standard in C++
|
||||
#define GGML_RESTRICT
|
||||
@ -2432,7 +2210,6 @@ extern "C" {
|
||||
size_t type_size;
|
||||
bool is_quantized;
|
||||
ggml_to_float_t to_float;
|
||||
ggml_from_float_t from_float;
|
||||
ggml_from_float_t from_float_ref;
|
||||
};
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,9 +1,5 @@
|
||||
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
|
||||
#pragma once
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
@ -12,27 +8,11 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
||||
|
||||
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
|
||||
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
|
||||
|
||||
// GEMV
|
||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// GEMM
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -466,18 +466,12 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
||||
return ggml_gallocr_hash_get(galloc, t)->allocated;
|
||||
}
|
||||
|
||||
static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
||||
hn->buffer_id = buffer_id;
|
||||
hn->offset = offset;
|
||||
hn->allocated = true;
|
||||
}
|
||||
|
||||
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
|
||||
return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
|
||||
}
|
||||
|
||||
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
|
||||
GGML_ASSERT(buffer_id >= 0);
|
||||
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
|
||||
|
||||
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
|
||||
@ -816,7 +810,11 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
|
||||
}
|
||||
|
||||
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
|
||||
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
|
||||
size_t node_size = 0;
|
||||
if (!node->data && !node->view_src) {
|
||||
GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
|
||||
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
|
||||
}
|
||||
return talloc->size_max >= node_size;
|
||||
}
|
||||
|
||||
|
107
ggml/src/ggml-amx/CMakeLists.txt
Normal file
107
ggml/src/ggml-amx/CMakeLists.txt
Normal file
@ -0,0 +1,107 @@
|
||||
if (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$") AND
|
||||
CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
|
||||
message(STATUS "Using AMX")
|
||||
|
||||
file(GLOB GGML_HEADERS_AMX "*.h")
|
||||
list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")
|
||||
|
||||
file(GLOB GGML_SOURCES_AMX "*.cpp")
|
||||
|
||||
add_library(ggml-amx
|
||||
${GGML_HEADERS_AMX}
|
||||
${GGML_SOURCES_AMX})
|
||||
|
||||
target_link_libraries(ggml-amx PRIVATE ggml-base)
|
||||
target_include_directories(ggml-amx PRIVATE . ..)
|
||||
|
||||
# this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
|
||||
# TODO: integrate AMX backend into the CPU backend
|
||||
if (MSVC)
|
||||
# instruction set detection for MSVC only
|
||||
if (GGML_NATIVE)
|
||||
# TODO: improve, should not reference files from the parent folder
|
||||
include(../ggml-cpu/cmake/FindSIMD.cmake)
|
||||
endif ()
|
||||
if (GGML_AVX512)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX512)
|
||||
# MSVC has no compile-time flags enabling specific
|
||||
# AVX512 extensions, neither it defines the
|
||||
# macros corresponding to the extensions.
|
||||
# Do it manually.
|
||||
if (GGML_AVX512_VBMI)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
|
||||
endif()
|
||||
if (GGML_AVX512_VNNI)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
||||
endif()
|
||||
if (GGML_AVX512_BF16)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
|
||||
endif()
|
||||
if (GGML_AMX_TILE)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
|
||||
endif()
|
||||
if (GGML_AMX_INT8)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
|
||||
endif()
|
||||
if (GGML_AMX_BF16)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
|
||||
endif()
|
||||
elseif (GGML_AVX2)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX2)
|
||||
elseif (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||
endif()
|
||||
else()
|
||||
if (GGML_NATIVE)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
endif()
|
||||
if (GGML_F16C)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
endif()
|
||||
if (GGML_FMA)
|
||||
list(APPEND ARCH_FLAGS -mfma)
|
||||
endif()
|
||||
if (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS -mavx)
|
||||
endif()
|
||||
if (GGML_AVX2)
|
||||
list(APPEND ARCH_FLAGS -mavx2)
|
||||
endif()
|
||||
if (GGML_AVX512)
|
||||
list(APPEND ARCH_FLAGS -mavx512f)
|
||||
list(APPEND ARCH_FLAGS -mavx512dq)
|
||||
list(APPEND ARCH_FLAGS -mavx512bw)
|
||||
endif()
|
||||
if (GGML_AVX512_VBMI)
|
||||
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
||||
endif()
|
||||
if (GGML_AVX512_VNNI)
|
||||
list(APPEND ARCH_FLAGS -mavx512vnni)
|
||||
endif()
|
||||
if (GGML_AVX512_BF16)
|
||||
list(APPEND ARCH_FLAGS -mavx512bf16)
|
||||
endif()
|
||||
if (GGML_AMX_TILE)
|
||||
list(APPEND ARCH_FLAGS -mamx-tile)
|
||||
endif()
|
||||
if (GGML_AMX_INT8)
|
||||
list(APPEND ARCH_FLAGS -mamx-int8)
|
||||
endif()
|
||||
if (GGML_AMX_BF16)
|
||||
list(APPEND ARCH_FLAGS -mamx-bf16)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
|
||||
else()
|
||||
set(GGML_AMX OFF PARENT_SCOPE)
|
||||
message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
|
||||
endif()
|
@ -1,7 +1,8 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu-impl.h" // <immintrin.h>
|
||||
// hack until AMX is moved into the CPU backend
|
||||
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <memory>
|
||||
|
@ -317,8 +317,6 @@ static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const st
|
||||
const enum ggml_type type = src0->type;
|
||||
const int64_t ne0 = op->ne[0];
|
||||
|
||||
bool is_training = src0->grad || src1->grad;
|
||||
|
||||
// amx kernels enables for Q4_0, Q4_1, Q8_0, F16
|
||||
// Q4_K, Q5_K, Q6_K, IQ4_XS enabled for QK_K = 256
|
||||
bool has_amx_kernels = qtype_has_amx_kernels(type) || (type == GGML_TYPE_F16);
|
||||
@ -326,7 +324,6 @@ static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const st
|
||||
bool can_use_amx =
|
||||
is_contiguous_2d(src0) && // src0 must be contiguous
|
||||
is_contiguous_2d(src1) && // src1 must be contiguous
|
||||
!is_training && // inference only
|
||||
src1->type == GGML_TYPE_F32 && // src1 must be float32
|
||||
has_amx_kernels && // with amx kernel impls
|
||||
ne0 % (TILE_N * 2) == 0; // out_features is 32x
|
||||
@ -421,9 +418,18 @@ ggml_backend_reg_t ggml_backend_amx_reg(void) {
|
||||
|
||||
#else // if defined(__AMX_INT8__)
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_amx(ggml_backend_t backend) {
|
||||
GGML_UNUSED(backend);
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_amx_init(void) {
|
||||
fprintf(stderr, "GGML is not compiled with AMX support!\n");
|
||||
return ggml_backend_t{};
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
|
||||
@ -433,4 +439,8 @@ void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
|
||||
GGML_UNUSED(n_threads);
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_amx_reg(void) {
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
#endif
|
@ -496,19 +496,20 @@ inline void from_float(const float * x, char * vy, int64_t k);
|
||||
|
||||
template <>
|
||||
inline void from_float<block_q8_0>(const float * x, char * vy, int64_t k) {
|
||||
quantize_row_q8_0(x, vy, k);
|
||||
// FIXME: using unoptimized reference impl until moved to CPU backend
|
||||
quantize_row_q8_0_ref(x, (block_q8_0 *)vy, k);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void from_float<block_q8_1>(const float * x, char * vy, int64_t k) {
|
||||
quantize_row_q8_1(x, vy, k);
|
||||
quantize_row_q8_1_ref(x, (block_q8_1 *)vy, k);
|
||||
}
|
||||
|
||||
template <>
|
||||
inline void from_float<block_q8_K>(const float * x, char * vy, int64_t k) {
|
||||
#if 1
|
||||
// TODO: this is reference impl!
|
||||
quantize_row_q8_K(x, vy, k);
|
||||
quantize_row_q8_K_ref(x, (block_q8_K *)vy, k);
|
||||
#else
|
||||
quantize_row_q8_K_vnni(x, vy, k);
|
||||
#endif
|
||||
|
195
ggml/src/ggml-backend-reg.cpp
Normal file
195
ggml/src/ggml-backend-reg.cpp
Normal file
@ -0,0 +1,195 @@
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <cstring>
|
||||
#include <vector>
|
||||
|
||||
// Backend registry
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
#include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
#include "ggml-vulkan.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_BLAS
|
||||
#include "ggml-blas.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_RPC
|
||||
#include "ggml-rpc.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_AMX
|
||||
# include "ggml-amx.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
struct ggml_backend_registry {
|
||||
std::vector<ggml_backend_reg_t> backends;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
ggml_backend_registry() {
|
||||
#ifdef GGML_USE_CUDA
|
||||
register_backend(ggml_backend_cuda_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_METAL
|
||||
register_backend(ggml_backend_metal_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_SYCL
|
||||
register_backend(ggml_backend_sycl_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_VULKAN
|
||||
register_backend(ggml_backend_vk_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CANN
|
||||
register_backend(ggml_backend_cann_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_BLAS
|
||||
register_backend(ggml_backend_blas_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_RPC
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_AMX
|
||||
register_backend(ggml_backend_amx_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
register_backend(ggml_backend_kompute_reg());
|
||||
#endif
|
||||
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
}
|
||||
|
||||
void register_backend(ggml_backend_reg_t reg) {
|
||||
if (!reg) {
|
||||
return;
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
||||
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
||||
#endif
|
||||
backends.push_back(reg);
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||
register_device(ggml_backend_reg_dev_get(reg, i));
|
||||
}
|
||||
}
|
||||
|
||||
void register_device(ggml_backend_dev_t device) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
||||
#endif
|
||||
devices.push_back(device);
|
||||
}
|
||||
};
|
||||
|
||||
static ggml_backend_registry & get_reg() {
|
||||
static ggml_backend_registry reg;
|
||||
return reg;
|
||||
}
|
||||
|
||||
// Internal API
|
||||
void ggml_backend_register(ggml_backend_reg_t reg) {
|
||||
get_reg().register_backend(reg);
|
||||
}
|
||||
|
||||
void ggml_backend_device_register(ggml_backend_dev_t device) {
|
||||
get_reg().register_device(device);
|
||||
}
|
||||
|
||||
// Backend (reg) enumeration
|
||||
size_t ggml_backend_reg_count() {
|
||||
return get_reg().backends.size();
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_reg_count());
|
||||
return get_reg().backends[index];
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
||||
if (std::strcmp(ggml_backend_reg_name(reg), name) == 0) {
|
||||
return reg;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Device enumeration
|
||||
size_t ggml_backend_dev_count() {
|
||||
return get_reg().devices.size();
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||
return get_reg().devices[index];
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
|
||||
return dev;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (ggml_backend_dev_type(dev) == type) {
|
||||
return dev;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Convenience functions
|
||||
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, params);
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, params);
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_init_best(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
||||
if (!dev) {
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
}
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, NULL);
|
||||
}
|
@ -279,7 +279,7 @@ void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, siz
|
||||
buf->iface.get_tensor(buf, tensor, data, offset, size);
|
||||
}
|
||||
|
||||
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
|
||||
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
|
||||
if (size == 0) {
|
||||
@ -525,197 +525,6 @@ void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * na
|
||||
return reg->iface.get_proc_address(reg, name);
|
||||
}
|
||||
|
||||
// Backend registry
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
#include "ggml-cuda.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
#include "ggml-metal.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_SYCL
|
||||
#include "ggml-sycl.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_VULKAN
|
||||
#include "ggml-vulkan.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_BLAS
|
||||
#include "ggml-blas.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_RPC
|
||||
#include "ggml-rpc.h"
|
||||
#endif
|
||||
|
||||
#ifndef __AMX_INT8__
|
||||
#undef GGML_USE_AMX
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_AMX
|
||||
# include "ggml-amx.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CANN
|
||||
#include "ggml-cann.h"
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
#include "ggml-kompute.h"
|
||||
#endif
|
||||
|
||||
#include "ggml-cpu.h"
|
||||
|
||||
struct ggml_backend_registry {
|
||||
std::vector<ggml_backend_reg_t> backends;
|
||||
std::vector<ggml_backend_dev_t> devices;
|
||||
|
||||
ggml_backend_registry() {
|
||||
#ifdef GGML_USE_CUDA
|
||||
register_backend(ggml_backend_cuda_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_METAL
|
||||
register_backend(ggml_backend_metal_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_SYCL
|
||||
register_backend(ggml_backend_sycl_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_VULKAN
|
||||
register_backend(ggml_backend_vk_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_CANN
|
||||
register_backend(ggml_backend_cann_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_BLAS
|
||||
register_backend(ggml_backend_blas_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_RPC
|
||||
register_backend(ggml_backend_rpc_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_AMX
|
||||
register_backend(ggml_backend_amx_reg());
|
||||
#endif
|
||||
#ifdef GGML_USE_KOMPUTE
|
||||
register_backend(ggml_backend_kompute_reg());
|
||||
#endif
|
||||
|
||||
register_backend(ggml_backend_cpu_reg());
|
||||
}
|
||||
|
||||
void register_backend(ggml_backend_reg_t reg) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
|
||||
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
|
||||
#endif
|
||||
backends.push_back(reg);
|
||||
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
|
||||
register_device(ggml_backend_reg_dev_get(reg, i));
|
||||
}
|
||||
}
|
||||
|
||||
void register_device(ggml_backend_dev_t device) {
|
||||
#ifndef NDEBUG
|
||||
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
|
||||
#endif
|
||||
devices.push_back(device);
|
||||
}
|
||||
};
|
||||
|
||||
static ggml_backend_registry & get_reg() {
|
||||
static ggml_backend_registry reg;
|
||||
return reg;
|
||||
}
|
||||
|
||||
// Internal API
|
||||
void ggml_backend_register(ggml_backend_reg_t reg) {
|
||||
get_reg().register_backend(reg);
|
||||
}
|
||||
|
||||
void ggml_backend_device_register(ggml_backend_dev_t device) {
|
||||
get_reg().register_device(device);
|
||||
}
|
||||
|
||||
// Backend (reg) enumeration
|
||||
size_t ggml_backend_reg_count() {
|
||||
return get_reg().backends.size();
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_reg_count());
|
||||
return get_reg().backends[index];
|
||||
}
|
||||
|
||||
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
|
||||
for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
|
||||
ggml_backend_reg_t reg = ggml_backend_reg_get(i);
|
||||
if (strcmp(ggml_backend_reg_name(reg), name) == 0) {
|
||||
return reg;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Device enumeration
|
||||
size_t ggml_backend_dev_count() {
|
||||
return get_reg().devices.size();
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
|
||||
GGML_ASSERT(index < ggml_backend_dev_count());
|
||||
return get_reg().devices[index];
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (strcmp(ggml_backend_dev_name(dev), name) == 0) {
|
||||
return dev;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
|
||||
for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_get(i);
|
||||
if (ggml_backend_dev_type(dev) == type) {
|
||||
return dev;
|
||||
}
|
||||
}
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Convenience functions
|
||||
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_name(name);
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, params);
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(type);
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, params);
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_init_best(void) {
|
||||
ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
|
||||
if (!dev) {
|
||||
dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
}
|
||||
if (!dev) {
|
||||
return NULL;
|
||||
}
|
||||
return ggml_backend_dev_init(dev, NULL);
|
||||
}
|
||||
|
||||
// multi-buffer buffer
|
||||
|
||||
struct ggml_backend_multi_buffer_context {
|
||||
@ -880,7 +689,7 @@ static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backen
|
||||
}
|
||||
|
||||
static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
|
||||
ggml_backend_buffer_t buffer = tensor->buffer;
|
||||
ggml_backend_buffer_t buffer = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
|
||||
if (buffer == NULL) {
|
||||
return -1;
|
||||
}
|
||||
@ -913,8 +722,6 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML
|
||||
|
||||
// returns the backend that should be used for the node based on the current locations
|
||||
static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
|
||||
// TODO: use supports_op to check if the backend supports the op
|
||||
|
||||
// assign pre-allocated nodes to their backend
|
||||
int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
|
||||
if (cur_backend_id != -1) {
|
||||
@ -933,7 +740,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
|
||||
|
||||
if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
|
||||
// since the tensor is pre-allocated, it cannot be moved to another backend
|
||||
GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
|
||||
GGML_ABORT("pre-allocated tensor (%s) in a backend that cannot run the operation", tensor->name);
|
||||
}
|
||||
|
||||
// graph input
|
||||
@ -1640,7 +1447,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
|
||||
bool parallel) {
|
||||
GGML_ASSERT(n_backends > 0);
|
||||
GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backends[n_backends - 1])); // last backend must be CPU
|
||||
GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
|
||||
|
||||
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
|
||||
|
||||
@ -1729,12 +1536,13 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph *
|
||||
|
||||
ggml_backend_sched_split_graph(sched, measure_graph);
|
||||
|
||||
ggml_backend_sched_synchronize(sched);
|
||||
|
||||
if (!ggml_gallocr_reserve_n(sched->galloc, &sched->graph, sched->node_backend_ids, sched->leaf_backend_ids)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ggml_backend_sched_reset(sched);
|
||||
ggml_backend_sched_synchronize(sched);
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -2036,17 +1844,6 @@ bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <cctype>
|
||||
#include <string>
|
||||
|
||||
// ggml-backend interface
|
||||
|
||||
// CPU backend - buffer
|
||||
|
||||
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
@ -2120,7 +1917,9 @@ static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
|
||||
/* .reset = */ NULL,
|
||||
};
|
||||
|
||||
// CPU backend - buffer type
|
||||
// CPU backend buffer type
|
||||
|
||||
// this buffer type is defined here to make it available to all backends
|
||||
|
||||
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU";
|
||||
@ -2161,7 +1960,7 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||
},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
@ -2184,478 +1983,14 @@ static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||
},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .device = */ NULL, // FIXME ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_buffer_type;
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
|
||||
// buffer type HBM
|
||||
|
||||
#include <hbwmalloc.h>
|
||||
|
||||
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU_HBM";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
hbw_free(buffer->context);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * ptr;
|
||||
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||
if (result != 0) {
|
||||
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||
},
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_buffer_type_hbm;
|
||||
}
|
||||
#endif
|
||||
|
||||
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
|
||||
static ggml_backend_buffer_type_t bufts[] = {
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
ggml_backend_cpu_hbm_buffer_type(),
|
||||
#endif
|
||||
NULL
|
||||
};
|
||||
|
||||
return bufts;
|
||||
|
||||
GGML_UNUSED(device);
|
||||
}
|
||||
|
||||
// CPU backend - backend (stream)
|
||||
|
||||
struct ggml_backend_cpu_context {
|
||||
int n_threads;
|
||||
ggml_threadpool_t threadpool;
|
||||
|
||||
uint8_t * work_data;
|
||||
size_t work_size;
|
||||
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
delete[] cpu_ctx->work_data;
|
||||
delete cpu_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
struct ggml_backend_plan_cpu {
|
||||
struct ggml_cplan cplan;
|
||||
struct ggml_cgraph cgraph;
|
||||
};
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
|
||||
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
||||
|
||||
if (cpu_plan->cplan.work_size > 0) {
|
||||
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
|
||||
if (cpu_plan->cplan.work_data == NULL) {
|
||||
delete cpu_plan;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
|
||||
return cpu_plan;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
delete[] cpu_plan->cplan.work_data;
|
||||
delete cpu_plan;
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
|
||||
if (cpu_ctx->work_size < cplan.work_size) {
|
||||
delete[] cpu_ctx->work_data;
|
||||
cpu_ctx->work_data = new uint8_t[cplan.work_size];
|
||||
if (cpu_ctx->work_data == NULL) {
|
||||
cpu_ctx->work_size = 0;
|
||||
return GGML_STATUS_ALLOC_FAILED;
|
||||
}
|
||||
cpu_ctx->work_size = cplan.work_size;
|
||||
}
|
||||
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
|
||||
|
||||
cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_get_name,
|
||||
/* .free = */ ggml_backend_cpu_free,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
/* .synchronize = */ NULL,
|
||||
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
||||
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
||||
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
||||
return &guid;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
// initialize CPU backend now to avoid slowing the first graph computation
|
||||
ggml_cpu_init();
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
|
||||
if (ctx == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||
ctx->threadpool = NULL;
|
||||
ctx->work_data = NULL;
|
||||
ctx->work_size = 0;
|
||||
ctx->abort_callback = NULL;
|
||||
ctx->abort_callback_data = NULL;
|
||||
|
||||
ggml_backend_t cpu_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .interface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
if (cpu_backend == NULL) {
|
||||
delete ctx;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return cpu_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
ctx->n_threads = n_threads;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
|
||||
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
||||
// already had a different threadpool, pause/suspend it before switching
|
||||
ggml_threadpool_pause(ctx->threadpool);
|
||||
}
|
||||
ctx->threadpool = threadpool;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
ctx->abort_callback = abort_callback;
|
||||
ctx->abort_callback_data = abort_callback_data;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
|
||||
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
|
||||
return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
|
||||
}
|
||||
|
||||
// CPU backend - device
|
||||
|
||||
struct ggml_backend_cpu_device_context {
|
||||
std::string description = "CPU";
|
||||
|
||||
ggml_backend_cpu_device_context() {
|
||||
#ifdef __APPLE__
|
||||
size_t len = 0;
|
||||
if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
|
||||
description.resize(len);
|
||||
sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
|
||||
}
|
||||
#elif defined(__linux__)
|
||||
FILE * f = fopen("/proc/cpuinfo", "r");
|
||||
if (f) {
|
||||
char buf[1024];
|
||||
while (fgets(buf, sizeof(buf), f)) {
|
||||
if (strncmp(buf, "model name", 10) == 0) {
|
||||
char * p = strchr(buf, ':');
|
||||
if (p) {
|
||||
p++;
|
||||
while (std::isspace(*p)) {
|
||||
p++;
|
||||
}
|
||||
while (std::isspace(p[strlen(p) - 1])) {
|
||||
p[strlen(p) - 1] = '\0';
|
||||
}
|
||||
description = p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
HKEY hKey;
|
||||
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
|
||||
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
|
||||
0,
|
||||
KEY_READ,
|
||||
&hKey) == ERROR_SUCCESS) {
|
||||
DWORD cpu_brand_size = 0;
|
||||
if (RegQueryValueExA(hKey,
|
||||
TEXT("ProcessorNameString"),
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||
description.resize(cpu_brand_size);
|
||||
if (RegQueryValueExA(hKey,
|
||||
TEXT("ProcessorNameString"),
|
||||
NULL,
|
||||
NULL,
|
||||
(LPBYTE)&description[0], // NOLINT
|
||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||
if (description.find('\0') != std::string::npos) {
|
||||
description.resize(description.find('\0'));
|
||||
}
|
||||
}
|
||||
}
|
||||
RegCloseKey(hKey);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
|
||||
struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
|
||||
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
// TODO
|
||||
*free = 0;
|
||||
*total = 0;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_cpu_device_get_name(dev);
|
||||
props->description = ggml_backend_cpu_device_get_description(dev);
|
||||
props->type = ggml_backend_cpu_device_get_type(dev);
|
||||
ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
/* .host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ true,
|
||||
/* .events = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||
return ggml_backend_cpu_init();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
return ggml_backend_cpu_buffer_type();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(max_tensor_size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
switch (op->op) {
|
||||
case GGML_OP_CPY:
|
||||
return
|
||||
op->type != GGML_TYPE_IQ2_XXS &&
|
||||
op->type != GGML_TYPE_IQ2_XS &&
|
||||
op->type != GGML_TYPE_IQ1_S &&
|
||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||
case GGML_OP_MUL_MAT:
|
||||
return op->src[1]->type == GGML_TYPE_F32;// FIXME || op->src[1]->type == ggml_get_type_traits(op->src[0]->type)->vec_dot_type;
|
||||
case GGML_OP_ROPE_BACK:
|
||||
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
||||
case GGML_OP_IM2COL_BACK:
|
||||
return op->src[0]->type == GGML_TYPE_F32 && op->src[1]->type == GGML_TYPE_F32;
|
||||
case GGML_OP_OUT_PROD:
|
||||
return (op->src[0]->type == GGML_TYPE_F32 || ggml_is_quantized(op->src[0]->type)) && op->src[1]->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
return ggml_backend_buft_is_host(buft);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_device_get_name,
|
||||
/* .get_description = */ ggml_backend_cpu_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
||||
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
|
||||
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
|
||||
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
// CPU backend - backend (reg)
|
||||
|
||||
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
return 1;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||
GGML_ASSERT(index == 0);
|
||||
|
||||
static ggml_backend_cpu_device_context ctx;
|
||||
static ggml_backend_device ggml_backend_cpu_device = {
|
||||
/* .iface = */ ggml_backend_cpu_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ &ctx,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_device;
|
||||
}
|
||||
|
||||
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_n_threads;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
||||
return (void *)ggml_backend_cpu_get_extra_bufts;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
|
||||
/* .get_device = */ ggml_backend_cpu_reg_get_device,
|
||||
/* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
|
||||
static struct ggml_backend_reg ggml_backend_cpu_reg = {
|
||||
/* .iface = */ ggml_backend_cpu_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_reg;
|
||||
}
|
||||
|
90
ggml/src/ggml-blas/CMakeLists.txt
Normal file
90
ggml/src/ggml-blas/CMakeLists.txt
Normal file
@ -0,0 +1,90 @@
|
||||
if (GGML_STATIC)
|
||||
set(BLA_STATIC ON)
|
||||
endif()
|
||||
#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
|
||||
# set(BLA_SIZEOF_INTEGER 8)
|
||||
#endif()
|
||||
|
||||
set(BLA_VENDOR ${GGML_BLAS_VENDOR})
|
||||
find_package(BLAS)
|
||||
|
||||
if (BLAS_FOUND)
|
||||
message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")
|
||||
|
||||
add_library(ggml-blas
|
||||
ggml-blas.cpp
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-blas PRIVATE ggml-base)
|
||||
target_include_directories(ggml-blas PRIVATE . ..)
|
||||
|
||||
if (${GGML_BLAS_VENDOR} MATCHES "Apple")
|
||||
add_compile_definitions(ACCELERATE_NEW_LAPACK)
|
||||
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
|
||||
add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
|
||||
elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
|
||||
# BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
|
||||
# see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
|
||||
find_package(PkgConfig REQUIRED)
|
||||
if (${GGML_BLAS_VENDOR} MATCHES "Generic")
|
||||
pkg_check_modules(DepBLAS blas)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
|
||||
# As of openblas v0.3.22, the 64-bit is named openblas64.pc
|
||||
pkg_check_modules(DepBLAS openblas64)
|
||||
if (NOT DepBLAS_FOUND)
|
||||
pkg_check_modules(DepBLAS openblas)
|
||||
endif()
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
|
||||
add_compile_definitions(GGML_BLAS_USE_BLIS)
|
||||
pkg_check_modules(DepBLAS blis)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
|
||||
pkg_check_modules(DepBLAS blas-atlas)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
|
||||
pkg_check_modules(DepBLAS flexiblas_api)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
|
||||
add_compile_definitions(GGML_BLAS_USE_MKL)
|
||||
# all Intel* libraries share the same include path
|
||||
pkg_check_modules(DepBLAS mkl-sdl)
|
||||
elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
|
||||
# this doesn't provide pkg-config
|
||||
# suggest to assign BLAS_INCLUDE_DIRS on your own
|
||||
if ("${NVHPC_VERSION}" STREQUAL "")
|
||||
message(WARNING "Better to set NVHPC_VERSION")
|
||||
else()
|
||||
set(DepBLAS_FOUND ON)
|
||||
set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
|
||||
endif()
|
||||
endif()
|
||||
if (DepBLAS_FOUND)
|
||||
set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
|
||||
else()
|
||||
message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
|
||||
" detected by pkgconfig, trying to find cblas.h from possible paths...")
|
||||
find_path(BLAS_INCLUDE_DIRS
|
||||
NAMES cblas.h
|
||||
HINTS
|
||||
/usr/include
|
||||
/usr/local/include
|
||||
/usr/include/openblas
|
||||
/opt/homebrew/opt/openblas/include
|
||||
/usr/local/opt/openblas/include
|
||||
/usr/include/x86_64-linux-gnu/openblas/include
|
||||
)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")
|
||||
|
||||
target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})
|
||||
|
||||
if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
|
||||
add_compile_definitions(GGML_BLAS_USE_MKL)
|
||||
endif()
|
||||
|
||||
target_link_libraries (ggml-blas PRIVATE ${BLAS_LIBRARIES})
|
||||
target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
|
||||
else()
|
||||
message(ERROR "BLAS not found, please refer to "
|
||||
"https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
|
||||
" to set correct GGML_BLAS_VENDOR")
|
||||
endif()
|
@ -6,7 +6,7 @@
|
||||
#include <vector>
|
||||
#include <cstring>
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#if defined(GGML_BLAS_USE_ACCELERATE)
|
||||
# include <Accelerate/Accelerate.h>
|
||||
#elif defined(GGML_BLAS_USE_MKL)
|
||||
# include <mkl.h>
|
||||
@ -320,7 +320,7 @@ static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
|
||||
}
|
||||
|
||||
static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#if defined(GGML_BLAS_USE_ACCELERATE)
|
||||
return "Accelerate";
|
||||
#elif defined(GGML_BLAS_USE_MKL)
|
||||
return "MKL";
|
46
ggml/src/ggml-cann/CMakeLists.txt
Normal file
46
ggml/src/ggml-cann/CMakeLists.txt
Normal file
@ -0,0 +1,46 @@
|
||||
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
|
||||
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
|
||||
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
|
||||
endif()
|
||||
|
||||
if (CANN_INSTALL_DIR)
|
||||
# Only Support Linux.
|
||||
if (NOT UNIX)
|
||||
message(FATAL_ERROR "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}")
|
||||
endif()
|
||||
|
||||
# Supported platforms: x86-64, arm64
|
||||
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
|
||||
else()
|
||||
message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
|
||||
# Set header and libs
|
||||
set(CANN_INCLUDE_DIRS
|
||||
${CANN_INSTALL_DIR}/include
|
||||
${CANN_INSTALL_DIR}/include/aclnn
|
||||
${CANN_INSTALL_DIR}/acllib/include
|
||||
)
|
||||
|
||||
add_subdirectory(kernels)
|
||||
list(APPEND CANN_LIBRARIES
|
||||
ascendcl
|
||||
nnopbase
|
||||
opapi
|
||||
acl_op_compiler
|
||||
ascendc_kernels
|
||||
)
|
||||
|
||||
file(GLOB GGML_SOURCES_CANN "*.cpp")
|
||||
|
||||
add_library(ggml-cann ${GGML_SOURCES_CANN})
|
||||
target_link_libraries(ggml-cann PRIVATE ggml-base ${CANN_LIBRARIES})
|
||||
target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
|
||||
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
|
||||
|
||||
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
||||
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
||||
else()
|
||||
message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?")
|
||||
endif()
|
261
ggml/src/ggml-cpu/CMakeLists.txt
Normal file
261
ggml/src/ggml-cpu/CMakeLists.txt
Normal file
@ -0,0 +1,261 @@
|
||||
add_library(ggml-cpu
|
||||
ggml-cpu.c
|
||||
ggml-cpu.cpp
|
||||
ggml-cpu-aarch64.c
|
||||
ggml-cpu-aarch64.h
|
||||
ggml-cpu-quants.c
|
||||
ggml-cpu-quants.h
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-cpu PRIVATE ggml-base)
|
||||
target_include_directories(ggml-cpu PRIVATE . ..)
|
||||
|
||||
if (APPLE AND GGML_ACCELERATE)
|
||||
find_library(ACCELERATE_FRAMEWORK Accelerate)
|
||||
if (ACCELERATE_FRAMEWORK)
|
||||
message(STATUS "Accelerate framework found")
|
||||
|
||||
add_compile_definitions(GGML_USE_ACCELERATE)
|
||||
add_compile_definitions(ACCELERATE_NEW_LAPACK)
|
||||
add_compile_definitions(ACCELERATE_LAPACK_ILP64)
|
||||
|
||||
target_link_libraries(ggml-cpu PRIVATE ${ACCELERATE_FRAMEWORK})
|
||||
else()
|
||||
message(WARNING "Accelerate framework not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_OPENMP)
|
||||
find_package(OpenMP)
|
||||
if (OpenMP_FOUND)
|
||||
message(STATUS "OpenMP found")
|
||||
|
||||
add_compile_definitions(GGML_USE_OPENMP)
|
||||
|
||||
target_link_libraries(ggml-cpu PRIVATE OpenMP::OpenMP_C OpenMP::OpenMP_CXX)
|
||||
|
||||
# FIXME: should be replaced with a compiler id check
|
||||
#if (GGML_MUSA)
|
||||
# list(APPEND GGML_CPU_EXTRA_INCLUDES "/usr/lib/llvm-14/lib/clang/14.0.0/include")
|
||||
# list(APPEND GGML_CPU_EXTRA_LIBS_PRIVATE "/usr/lib/llvm-14/lib/libomp.so")
|
||||
#endif()
|
||||
else()
|
||||
message(WARNING "OpenMP not found")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_LLAMAFILE)
|
||||
message(STATUS "Using llamafile")
|
||||
|
||||
add_compile_definitions(GGML_USE_LLAMAFILE)
|
||||
|
||||
target_sources(ggml-cpu PRIVATE
|
||||
llamafile/sgemm.cpp
|
||||
llamafile/sgemm.h)
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_HBM)
|
||||
find_library(memkind memkind REQUIRED)
|
||||
|
||||
message(STATUS "Using memkind for CPU HBM")
|
||||
|
||||
add_compile_definitions(GGML_USE_CPU_HBM)
|
||||
|
||||
target_link_libraries(ggml-cpu PUBLIC memkind)
|
||||
endif()
|
||||
|
||||
if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR
|
||||
CMAKE_GENERATOR_PLATFORM_LWR STREQUAL "arm64" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND
|
||||
NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
|
||||
|
||||
message(STATUS "ARM detected")
|
||||
|
||||
if (MSVC)
|
||||
add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
|
||||
add_compile_definitions(__ARM_NEON)
|
||||
add_compile_definitions(__ARM_FEATURE_FMA)
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
|
||||
string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
if (GGML_COMPILER_SUPPORT_DOTPROD)
|
||||
add_compile_definitions(__ARM_FEATURE_DOTPROD)
|
||||
endif ()
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vmlaq_f32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
|
||||
if (GGML_COMPILER_SUPPORT_MATMUL_INT8)
|
||||
add_compile_definitions(__ARM_FEATURE_MATMUL_INT8)
|
||||
endif ()
|
||||
|
||||
check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
||||
if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
|
||||
add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
endif ()
|
||||
|
||||
set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
|
||||
else()
|
||||
check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
|
||||
if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")
|
||||
list(APPEND ARCH_FLAGS -mfp16-format=ieee)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv6")
|
||||
# Raspberry Pi 1, Zero
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access)
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv7")
|
||||
if ("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
|
||||
# Android armeabi-v7a
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-vfpv4 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
else()
|
||||
# Raspberry Pi 2
|
||||
list(APPEND ARCH_FLAGS -mfpu=neon-fp-armv8 -mno-unaligned-access -funsafe-math-optimizations)
|
||||
endif()
|
||||
endif()
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "armv8")
|
||||
# Android arm64-v8a
|
||||
# Raspberry Pi 3, 4, Zero 2 (32-bit)
|
||||
list(APPEND ARCH_FLAGS -mno-unaligned-access)
|
||||
endif()
|
||||
if (GGML_SVE)
|
||||
list(APPEND ARCH_FLAGS -march=armv8.6-a+sve)
|
||||
endif()
|
||||
endif()
|
||||
elseif (CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
|
||||
(NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
|
||||
CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
|
||||
message(STATUS "x86 detected")
|
||||
if (MSVC)
|
||||
# instruction set detection for MSVC only
|
||||
if (GGML_NATIVE)
|
||||
# TODO: improve, should not reference files from the parent folder
|
||||
include(cmake/FindSIMD.cmake)
|
||||
endif ()
|
||||
if (GGML_AVX512)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX512)
|
||||
# MSVC has no compile-time flags enabling specific
|
||||
# AVX512 extensions, neither it defines the
|
||||
# macros corresponding to the extensions.
|
||||
# Do it manually.
|
||||
if (GGML_AVX512_VBMI)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_AVX512_VNNI)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
list(APPEND ARCH_FLAGS -mavx512vnni)
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_AVX512_BF16)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
|
||||
if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
|
||||
list(APPEND ARCH_FLAGS -mavx512bf16)
|
||||
endif()
|
||||
endif()
|
||||
if (GGML_AMX_TILE)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
|
||||
endif()
|
||||
if (GGML_AMX_INT8)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
|
||||
endif()
|
||||
if (GGML_AMX_BF16)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
|
||||
add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
|
||||
endif()
|
||||
elseif (GGML_AVX2)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX2)
|
||||
elseif (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS /arch:AVX)
|
||||
endif()
|
||||
else()
|
||||
if (GGML_NATIVE)
|
||||
list(APPEND ARCH_FLAGS -march=native)
|
||||
endif()
|
||||
if (GGML_F16C)
|
||||
list(APPEND ARCH_FLAGS -mf16c)
|
||||
endif()
|
||||
if (GGML_FMA)
|
||||
list(APPEND ARCH_FLAGS -mfma)
|
||||
endif()
|
||||
if (GGML_AVX)
|
||||
list(APPEND ARCH_FLAGS -mavx)
|
||||
endif()
|
||||
if (GGML_AVX2)
|
||||
list(APPEND ARCH_FLAGS -mavx2)
|
||||
endif()
|
||||
if (GGML_AVX512)
|
||||
list(APPEND ARCH_FLAGS -mavx512f)
|
||||
list(APPEND ARCH_FLAGS -mavx512dq)
|
||||
list(APPEND ARCH_FLAGS -mavx512bw)
|
||||
endif()
|
||||
if (GGML_AVX512_VBMI)
|
||||
list(APPEND ARCH_FLAGS -mavx512vbmi)
|
||||
endif()
|
||||
if (GGML_AVX512_VNNI)
|
||||
list(APPEND ARCH_FLAGS -mavx512vnni)
|
||||
endif()
|
||||
if (GGML_AVX512_BF16)
|
||||
list(APPEND ARCH_FLAGS -mavx512bf16)
|
||||
endif()
|
||||
if (GGML_AMX_TILE)
|
||||
list(APPEND ARCH_FLAGS -mamx-tile)
|
||||
endif()
|
||||
if (GGML_AMX_INT8)
|
||||
list(APPEND ARCH_FLAGS -mamx-int8)
|
||||
endif()
|
||||
if (GGML_AMX_BF16)
|
||||
list(APPEND ARCH_FLAGS -mamx-bf16)
|
||||
endif()
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
message(STATUS "PowerPC detected")
|
||||
execute_process(COMMAND bash -c "grep POWER10 /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
||||
string(FIND "${POWER10_M}" "POWER10" substring_index)
|
||||
if (NOT DEFINED substring_index OR "${substring_index}" STREQUAL "")
|
||||
set(substring_index -1)
|
||||
endif()
|
||||
|
||||
if (${substring_index} GREATER_EQUAL 0)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native)
|
||||
#TODO: Add targets for Power8/Power9 (Altivec/VSX) and Power10(MMA) and query for big endian systems (ppc64/le/be)
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
message(STATUS "loongarch64 detected")
|
||||
|
||||
list(APPEND ARCH_FLAGS -march=loongarch64)
|
||||
if (GGML_LASX)
|
||||
list(APPEND ARCH_FLAGS -mlasx)
|
||||
endif()
|
||||
if (GGML_LSX)
|
||||
list(APPEND ARCH_FLAGS -mlsx)
|
||||
endif()
|
||||
else()
|
||||
message(STATUS "Unknown architecture")
|
||||
endif()
|
||||
|
||||
if (GGML_CPU_AARCH64)
|
||||
message(STATUS "Using runtime weight conversion of Q4_0 to Q4_0_x_x to enable optimized GEMM/GEMV kernels")
|
||||
add_compile_definitions(GGML_USE_CPU_AARCH64)
|
||||
endif()
|
||||
|
||||
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
|
||||
target_compile_options(ggml-cpu PRIVATE "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")
|
||||
|
||||
if (EMSCRIPTEN)
|
||||
set_target_properties(ggml-cpu PROPERTIES COMPILE_FLAGS "-msimd128")
|
||||
endif()
|
3560
ggml/src/ggml-cpu/ggml-cpu-aarch64.c
Normal file
3560
ggml/src/ggml-cpu/ggml-cpu-aarch64.c
Normal file
File diff suppressed because it is too large
Load Diff
30
ggml/src/ggml-cpu/ggml-cpu-aarch64.h
Normal file
30
ggml/src/ggml-cpu/ggml-cpu-aarch64.h
Normal file
@ -0,0 +1,30 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML internal header
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
|
||||
|
||||
// GEMV
|
||||
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
// GEMM
|
||||
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
|
||||
|
||||
void ggml_aarch64_repack_tensor(struct ggml_tensor * cur, enum ggml_type repack_type, const void * data, size_t data_size);
|
||||
enum ggml_type ggml_aarch64_get_optimal_repack_type(const struct ggml_tensor * cur);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
@ -27,80 +27,6 @@ extern "C" {
|
||||
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This is binary identical with Google Brain float conversion.
|
||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
// __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
|
||||
#if defined(_MSC_VER) && (defined(__AVX2__) || defined(__AVX512F__))
|
||||
#ifndef __FMA__
|
||||
@ -388,28 +314,6 @@ inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b)
|
||||
|
||||
#endif // defined(__ARM_NEON)
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
ggml_fp16_internal_t tmp;
|
||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||
return (float)tmp;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
ggml_fp16_internal_t tmp = f;
|
||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#ifdef __wasm_simd128__
|
||||
#include <wasm_simd128.h>
|
||||
#else
|
||||
@ -462,153 +366,6 @@ static __m256 __lasx_xvreplfr2vr_s(float val) {
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef __F16C__
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
#else
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#endif
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
// ref: https://github.com/Maratyszcza/FP16
|
||||
|
||||
static inline float fp32_from_bits(uint32_t w) {
|
||||
union {
|
||||
uint32_t as_bits;
|
||||
float as_value;
|
||||
} fp32;
|
||||
fp32.as_bits = w;
|
||||
return fp32.as_value;
|
||||
}
|
||||
|
||||
static inline uint32_t fp32_to_bits(float f) {
|
||||
union {
|
||||
float as_value;
|
||||
uint32_t as_bits;
|
||||
} fp32;
|
||||
fp32.as_value = f;
|
||||
return fp32.as_bits;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float exp_scale = 0x1.0p-112f;
|
||||
#else
|
||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
||||
#endif
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
|
||||
const float scale_to_inf = 0x1.0p+112f;
|
||||
const float scale_to_zero = 0x1.0p-110f;
|
||||
#else
|
||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
||||
#endif
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#endif // __F16C__
|
||||
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
extern float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32)
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
return ggml_table_f32_f16[s];
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_FP32_TO_FP16)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
10822
ggml/src/ggml-cpu/ggml-cpu-quants.c
Normal file
10822
ggml/src/ggml-cpu/ggml-cpu-quants.c
Normal file
File diff suppressed because it is too large
Load Diff
63
ggml/src/ggml-cpu/ggml-cpu-quants.h
Normal file
63
ggml/src/ggml-cpu/ggml-cpu-quants.h
Normal file
@ -0,0 +1,63 @@
|
||||
#pragma once
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
#include "ggml-common.h"
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
// GGML CPU internal header
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Quantization
|
||||
void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_tq1_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_tq2_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
void quantize_row_iq4_nl (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
void quantize_row_iq4_xs (const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
|
||||
|
||||
// Dot product
|
||||
void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_tq1_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_tq2_0_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
void ggml_vec_dot_iq2_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq2_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_xxs_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq1_m_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_nl_q8_0 (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq4_xs_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, size_t bx, const void * GGML_RESTRICT vy, size_t by, int nrc);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -1,13 +1,15 @@
|
||||
#define _CRT_SECURE_NO_DEPRECATE // Disables "unsafe" warnings on Windows
|
||||
#define _USE_MATH_DEFINES // For M_PI on MSVC
|
||||
|
||||
#include "ggml-aarch64.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-cpu-aarch64.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-impl.h"
|
||||
#include "ggml-quants.h"
|
||||
#include "ggml-cpu-quants.h"
|
||||
#include "ggml-threading.h"
|
||||
#include "ggml.h"
|
||||
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
@ -42,7 +44,7 @@
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_LLAMAFILE
|
||||
#include <llamafile/sgemm.h>
|
||||
#include "llamafile/sgemm.h"
|
||||
#endif
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
@ -104,9 +106,6 @@ static ggml_fp16_t ggml_table_gelu_f16[1 << 16];
|
||||
// precomputed quick gelu table for f16 (128 KB)
|
||||
static ggml_fp16_t ggml_table_gelu_quick_f16[1 << 16];
|
||||
|
||||
// precomputed f32 table for f16 (256 KB) (ggml-impl.h)
|
||||
float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
#if defined(__ARM_ARCH)
|
||||
struct ggml_arm_arch_features_type {
|
||||
int has_neon;
|
||||
@ -261,11 +260,13 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_F16] = {
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_fp16_row,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_f16,
|
||||
.vec_dot_type = GGML_TYPE_F16,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_0] = {
|
||||
.from_float = quantize_row_q4_0,
|
||||
.vec_dot = ggml_vec_dot_q4_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
@ -275,6 +276,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
#endif
|
||||
},
|
||||
[GGML_TYPE_Q4_1] = {
|
||||
.from_float = quantize_row_q4_1,
|
||||
.vec_dot = ggml_vec_dot_q4_1_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
#if defined (__ARM_FEATURE_MATMUL_INT8)
|
||||
@ -283,27 +285,20 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.nrows = 1,
|
||||
#endif
|
||||
},
|
||||
[4] = { // GGML_TYPE_Q4_2
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_COUNT,
|
||||
.nrows = 1,
|
||||
},
|
||||
[5] = { // GGML_TYPE_Q4_3
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_COUNT,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q5_0] = {
|
||||
.from_float = quantize_row_q5_0,
|
||||
.vec_dot = ggml_vec_dot_q5_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q5_1] = {
|
||||
.from_float = quantize_row_q5_1,
|
||||
.vec_dot = ggml_vec_dot_q5_1_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q8_0] = {
|
||||
.from_float = quantize_row_q8_0,
|
||||
.from_float_to_mat = quantize_mat_q8_0,
|
||||
.vec_dot = ggml_vec_dot_q8_0_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
@ -314,85 +309,106 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
#endif
|
||||
},
|
||||
[GGML_TYPE_Q8_1] = {
|
||||
.from_float = quantize_row_q8_1,
|
||||
.vec_dot_type = GGML_TYPE_Q8_1,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q2_K] = {
|
||||
.from_float = quantize_row_q2_K,
|
||||
.vec_dot = ggml_vec_dot_q2_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q3_K] = {
|
||||
.from_float = quantize_row_q3_K,
|
||||
.vec_dot = ggml_vec_dot_q3_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_K] = {
|
||||
.from_float = quantize_row_q4_K,
|
||||
.vec_dot = ggml_vec_dot_q4_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q5_K] = {
|
||||
.from_float = quantize_row_q5_K,
|
||||
.vec_dot = ggml_vec_dot_q5_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q6_K] = {
|
||||
.from_float = quantize_row_q6_K,
|
||||
.vec_dot = ggml_vec_dot_q6_K_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ2_XXS] = {
|
||||
.from_float = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ2_XS] = {
|
||||
.from_float = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq2_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ3_XXS] = {
|
||||
// NOTE: from_float for iq3 and iq2_s was removed because these quants require initialization in ggml_quantize_init
|
||||
//.from_float = quantize_row_iq3_xxs,
|
||||
.vec_dot = ggml_vec_dot_iq3_xxs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ3_S] = {
|
||||
//.from_float = quantize_row_iq3_s,
|
||||
.vec_dot = ggml_vec_dot_iq3_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ2_S] = {
|
||||
//.from_float = quantize_row_iq2_s,
|
||||
.vec_dot = ggml_vec_dot_iq2_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ1_S] = {
|
||||
.from_float = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq1_s_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ1_M] = {
|
||||
.from_float = NULL,
|
||||
.vec_dot = ggml_vec_dot_iq1_m_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ4_NL] = {
|
||||
.from_float = quantize_row_iq4_nl,
|
||||
.vec_dot = ggml_vec_dot_iq4_nl_q8_0,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_IQ4_XS] = {
|
||||
.from_float = quantize_row_iq4_xs,
|
||||
.vec_dot = ggml_vec_dot_iq4_xs_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q8_K] = {
|
||||
.from_float = quantize_row_q8_K,
|
||||
},
|
||||
[GGML_TYPE_BF16] = {
|
||||
.from_float = (ggml_from_float_t) ggml_fp32_to_bf16_row,
|
||||
.vec_dot = (ggml_vec_dot_t) ggml_vec_dot_bf16,
|
||||
.vec_dot_type = GGML_TYPE_BF16,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_4] = {
|
||||
.from_float = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
@ -401,6 +417,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.gemm = ggml_gemm_q4_0_4x4_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_4_8] = {
|
||||
.from_float = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
@ -409,6 +426,7 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.gemm = ggml_gemm_q4_0_4x8_q8_0,
|
||||
},
|
||||
[GGML_TYPE_Q4_0_8_8] = {
|
||||
.from_float = NULL,
|
||||
.vec_dot = NULL,
|
||||
.vec_dot_type = GGML_TYPE_Q8_0,
|
||||
.nrows = 1,
|
||||
@ -417,11 +435,13 @@ static const struct ggml_type_traits_cpu type_traits_cpu[GGML_TYPE_COUNT] = {
|
||||
.gemm = ggml_gemm_q4_0_8x8_q8_0,
|
||||
},
|
||||
[GGML_TYPE_TQ1_0] = {
|
||||
.from_float = quantize_row_tq1_0,
|
||||
.vec_dot = ggml_vec_dot_tq1_0_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
},
|
||||
[GGML_TYPE_TQ2_0] = {
|
||||
.from_float = quantize_row_tq2_0,
|
||||
.vec_dot = ggml_vec_dot_tq2_0_q8_K,
|
||||
.vec_dot_type = GGML_TYPE_Q8_K,
|
||||
.nrows = 1,
|
||||
@ -1449,8 +1469,12 @@ static void ggml_vec_dot_bf16(int n, float * restrict s, size_t bs, ggml_bf16_t
|
||||
sumf += (ggml_float)_mm512_reduce_add_ps(c2);
|
||||
|
||||
#undef LOAD
|
||||
#elif defined(__AVX2__)
|
||||
#elif defined(__AVX2__) || defined(__AVX__)
|
||||
#if defined(__AVX2__)
|
||||
#define LOAD(p) _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16))
|
||||
#else
|
||||
#define LOAD(p) _mm256_castsi256_ps(_mm256_insertf128_si256(_mm256_castsi128_si256(_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_loadu_si128((const __m128i *)(p))), 16)), (_mm_slli_epi32(_mm_cvtepu16_epi32(_mm_bsrli_si128(_mm_loadu_si128((const __m128i *)(p)), 8)), 16)), 1))
|
||||
#endif
|
||||
__m256 c1 = _mm256_setzero_ps();
|
||||
__m256 c2 = _mm256_setzero_ps();
|
||||
__m256 c3 = _mm256_setzero_ps();
|
||||
@ -2250,22 +2274,7 @@ struct ggml_state {
|
||||
struct ggml_numa_nodes numa;
|
||||
};
|
||||
|
||||
// global state
|
||||
static struct ggml_state g_state = {0};
|
||||
static atomic_flag g_state_critical = ATOMIC_FLAG_INIT;
|
||||
|
||||
// TODO: move to threading file
|
||||
// critical section via spin lock
|
||||
void ggml_critical_section_start(void) {
|
||||
while (atomic_flag_test_and_set(&g_state_critical)) {
|
||||
// spin
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_critical_section_end(void) {
|
||||
atomic_flag_clear(&g_state_critical);
|
||||
}
|
||||
|
||||
static void ggml_barrier(struct ggml_threadpool * tp) {
|
||||
int n_threads = atomic_load_explicit(&tp->n_threads_cur, memory_order_relaxed);
|
||||
@ -2360,7 +2369,7 @@ void ggml_numa_init(enum ggml_numa_strategy numa_flag) {
|
||||
// figure out which node we're on
|
||||
uint current_cpu;
|
||||
int getcpu_ret = 0;
|
||||
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 28) || defined(__COSMOPOLITAN__)
|
||||
#if __GLIBC__ > 2 || (__GLIBC__ == 2 && __GLIBC_MINOR__ > 33) || defined(__COSMOPOLITAN__)
|
||||
getcpu_ret = getcpu(¤t_cpu, &g_state.numa.current_node);
|
||||
#else
|
||||
// old glibc doesn't have a wrapper for this call. Fall back on direct syscall
|
||||
@ -2997,8 +3006,8 @@ static void ggml_compute_forward_dup_f16(
|
||||
id += ne00 * (ne01 - ir1);
|
||||
}
|
||||
}
|
||||
} else if (ggml_get_type_traits(dst->type)->from_float) {
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
|
||||
} else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
|
||||
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
||||
|
||||
size_t id = 0;
|
||||
@ -3278,8 +3287,8 @@ static void ggml_compute_forward_dup_bf16(
|
||||
id += ne00 * (ne01 - ir1);
|
||||
}
|
||||
}
|
||||
} else if (ggml_get_type_traits(dst->type)->from_float) {
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
|
||||
} else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
|
||||
float * src0_f32 = (float *) params->wdata + (ne00 + CACHE_LINE_SIZE_F32) * ith;
|
||||
|
||||
size_t id = 0;
|
||||
@ -3594,8 +3603,8 @@ static void ggml_compute_forward_dup_f32(
|
||||
id += rs * (ne01 - ir1);
|
||||
}
|
||||
}
|
||||
} else if (ggml_get_type_traits(dst->type)->from_float) {
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dst->type)->from_float;
|
||||
} else if (ggml_get_type_traits_cpu(dst->type)->from_float) {
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dst->type)->from_float;
|
||||
|
||||
size_t id = 0;
|
||||
size_t rs = nb0 * (ne00 / ggml_blck_size(dst->type));
|
||||
@ -4377,7 +4386,7 @@ static void ggml_compute_forward_add_q_f32(
|
||||
const enum ggml_type type = src0->type;
|
||||
const enum ggml_type dtype = dst->type;
|
||||
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(dtype)->from_float;
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(dtype)->from_float;
|
||||
|
||||
// we don't support permuted src0 or src1
|
||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||
@ -4679,7 +4688,7 @@ static void ggml_compute_forward_add1_q_f32(
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
ggml_to_float_t const dequantize_row_q = ggml_get_type_traits(type)->to_float;
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits(type)->from_float;
|
||||
ggml_from_float_t const quantize_row_q = ggml_get_type_traits_cpu(type)->from_float;
|
||||
|
||||
// we don't support permuted src0
|
||||
GGML_ASSERT(nb00 == ggml_type_size(type));
|
||||
@ -7325,6 +7334,7 @@ static void ggml_compute_forward_group_norm(
|
||||
static void ggml_compute_forward_mul_mat_one_chunk(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const enum ggml_type type,
|
||||
const int64_t num_rows_per_vec_dot,
|
||||
const int64_t ir0_start,
|
||||
const int64_t ir0_end,
|
||||
@ -7336,8 +7346,6 @@ static void ggml_compute_forward_mul_mat_one_chunk(
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
|
||||
const bool src1_cont = ggml_is_contiguous(src1);
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
@ -7425,10 +7433,14 @@ static void ggml_compute_forward_mul_mat(
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const enum ggml_type type = src0->type;
|
||||
enum ggml_type type = src0->type;
|
||||
|
||||
if (src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
|
||||
type = (enum ggml_type)(intptr_t)src0->extra;
|
||||
}
|
||||
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float;
|
||||
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
ggml_from_float_to_mat_t const from_float_to_mat = type_traits_cpu[vec_dot_type].from_float_to_mat;
|
||||
int64_t const vec_dot_num_rows = type_traits_cpu[type].nrows;
|
||||
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
|
||||
@ -7464,15 +7476,15 @@ static void ggml_compute_forward_mul_mat(
|
||||
if (src1_cont) {
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
|
||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||
nb01/ggml_type_size(src0->type),
|
||||
nb01/ggml_type_size(type),
|
||||
(const char *)src1->data + i12*nb12 + i13*nb13,
|
||||
nb11/ggml_type_size(src1->type),
|
||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||
nb1/ggml_type_size(dst->type),
|
||||
ith, nth,
|
||||
src0->type,
|
||||
type,
|
||||
src1->type,
|
||||
dst->type))
|
||||
goto UseGgmlGemm1;
|
||||
@ -7525,15 +7537,15 @@ UseGgmlGemm1:;
|
||||
|
||||
for (int64_t i13 = 0; i13 < ne13; i13++)
|
||||
for (int64_t i12 = 0; i12 < ne12; i12++)
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
|
||||
if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(type),
|
||||
(const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
|
||||
nb01/ggml_type_size(src0->type),
|
||||
nb01/ggml_type_size(type),
|
||||
(const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
|
||||
row_size/ggml_type_size(vec_dot_type),
|
||||
(char *)dst->data + i12*nb2 + i13*nb3,
|
||||
nb1/ggml_type_size(dst->type),
|
||||
ith, nth,
|
||||
src0->type,
|
||||
type,
|
||||
vec_dot_type,
|
||||
dst->type))
|
||||
goto UseGgmlGemm2;
|
||||
@ -7618,7 +7630,7 @@ UseGgmlGemm2:;
|
||||
const int64_t ir1_start = dr1 * ith1;
|
||||
const int64_t ir1_end = MIN(ir1_start + dr1, nr1);
|
||||
|
||||
ggml_compute_forward_mul_mat_one_chunk(params, dst, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
|
||||
ggml_compute_forward_mul_mat_one_chunk(params, dst, type, num_rows_per_vec_dot, ir0_start, ir0_end, ir1_start, ir1_end);
|
||||
|
||||
if (nth >= nchunk0 * nchunk1) {
|
||||
break;
|
||||
@ -7649,7 +7661,7 @@ static void ggml_compute_forward_mul_mat_id(
|
||||
|
||||
ggml_vec_dot_t const vec_dot = type_traits_cpu[type].vec_dot;
|
||||
enum ggml_type const vec_dot_type = type_traits_cpu[type].vec_dot_type;
|
||||
ggml_from_float_t const from_float = ggml_get_type_traits(vec_dot_type)->from_float;
|
||||
ggml_from_float_t const from_float = type_traits_cpu[vec_dot_type].from_float;
|
||||
int64_t const matmul_num_cols = type_traits_cpu[type].ncols;
|
||||
ggml_gemv_t const gemv = type_traits_cpu[type].gemv;
|
||||
|
||||
@ -9159,12 +9171,6 @@ static void rope_yarn(
|
||||
*sin_theta = sinf(theta) * mscale;
|
||||
}
|
||||
|
||||
// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
|
||||
// `corr_dim(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))`
|
||||
static float ggml_rope_yarn_corr_dim(int n_dims, int n_ctx_orig, float n_rot, float base) {
|
||||
return n_dims * logf(n_ctx_orig / (n_rot * 2 * (float)M_PI)) / (2 * logf(base));
|
||||
}
|
||||
|
||||
static void ggml_rope_cache_init(
|
||||
float theta_base, float freq_scale, const float * freq_factors, float corr_dims[2], int64_t ne0, float ext_factor, float mscale,
|
||||
float * cache, float sin_sign, float theta_scale) {
|
||||
@ -9181,16 +9187,6 @@ static void ggml_rope_cache_init(
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_rope_yarn_corr_dims(
|
||||
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]
|
||||
) {
|
||||
// start and end correction dims
|
||||
float start = floorf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_fast, freq_base));
|
||||
float end = ceilf(ggml_rope_yarn_corr_dim(n_dims, n_ctx_orig, beta_slow, freq_base));
|
||||
dims[0] = MAX(0, start);
|
||||
dims[1] = MIN(n_dims - 1, end);
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_rope_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
@ -10668,7 +10664,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
enum ggml_type const k_vec_dot_type = type_traits_cpu[k->type].vec_dot_type;
|
||||
ggml_from_float_t const q_to_vec_dot = ggml_get_type_traits(k_vec_dot_type)->from_float;
|
||||
ggml_from_float_t const q_to_vec_dot = type_traits_cpu[k_vec_dot_type].from_float;
|
||||
ggml_vec_dot_t const kq_vec_dot = type_traits_cpu[k->type].vec_dot;
|
||||
ggml_to_float_t const v_to_float = ggml_get_type_traits(v->type)->to_float;
|
||||
|
||||
@ -12220,11 +12216,16 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst) {
|
||||
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src0_grad = dst->src[1];
|
||||
const struct ggml_tensor * src0_grad_m = dst->src[2];
|
||||
const struct ggml_tensor * src0_grad_v = dst->src[3];
|
||||
const struct ggml_tensor * src0 = dst->src[0];
|
||||
const struct ggml_tensor * src0_grad = dst->src[1];
|
||||
const struct ggml_tensor * src0_grad_m = dst->src[2];
|
||||
const struct ggml_tensor * src0_grad_v = dst->src[3];
|
||||
const struct ggml_tensor * adamw_params = dst->src[4];
|
||||
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
@ -12241,16 +12242,14 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
const int ir0 = dr*ith;
|
||||
const int ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
/* const float gnorm = 1.0f; */
|
||||
int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
|
||||
const float alpha = ggml_get_op_params_f32(dst, 2);
|
||||
const float beta1 = ggml_get_op_params_f32(dst, 3);
|
||||
const float beta2 = ggml_get_op_params_f32(dst, 4);
|
||||
const float eps = ggml_get_op_params_f32(dst, 5);
|
||||
const float wd = ggml_get_op_params_f32(dst, 6);
|
||||
|
||||
const float beta1h = alpha/(1.0f - powf(beta1, iter));
|
||||
const float beta2h = 1.0f/(1.0f - powf(beta2, iter));
|
||||
const float * adamw_params_ptr = ggml_get_data_f32(adamw_params);
|
||||
const float alpha = adamw_params_ptr[0];
|
||||
const float beta1 = adamw_params_ptr[1];
|
||||
const float beta2 = adamw_params_ptr[2];
|
||||
const float eps = adamw_params_ptr[3];
|
||||
const float wd = adamw_params_ptr[4];
|
||||
const float beta1h = adamw_params_ptr[5];
|
||||
const float beta2h = adamw_params_ptr[6];
|
||||
|
||||
for (int ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
@ -12274,17 +12273,9 @@ static void ggml_compute_forward_opt_step_adamw_f32(
|
||||
// The weight decay is applied independently of the Adam momenta m and v.
|
||||
// This is NOT equivalent to l2 regularization that adds w[i00]*w[i00] to the loss.
|
||||
// See: https://arxiv.org/pdf/1711.05101v3.pdf
|
||||
w[i00] = w[i00]*(1.0f - alpha*wd) - mh/vh;
|
||||
w[i00] = w[i00]*(1.0f - alpha*wd) - alpha*mh/vh;
|
||||
}
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
if (ith != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
iter++;
|
||||
memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_opt_step_adamw(
|
||||
@ -13759,6 +13750,151 @@ enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct g
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
|
||||
int ggml_cpu_has_avx(void) {
|
||||
#if defined(__AVX__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx_vnni(void) {
|
||||
#if defined(__AVXVNNI__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx2(void) {
|
||||
#if defined(__AVX2__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx512(void) {
|
||||
#if defined(__AVX512F__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx512_vbmi(void) {
|
||||
#if defined(__AVX512VBMI__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx512_vnni(void) {
|
||||
#if defined(__AVX512VNNI__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_avx512_bf16(void) {
|
||||
#if defined(__AVX512BF16__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_amx_int8(void) {
|
||||
#if defined(__AMX_INT8__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_fma(void) {
|
||||
#if defined(__FMA__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_arm_fma(void) {
|
||||
#if defined(__ARM_FEATURE_FMA)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_riscv_v(void) {
|
||||
#if defined(__riscv_v_intrinsic)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_f16c(void) {
|
||||
#if defined(__F16C__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_fp16_va(void) {
|
||||
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_wasm_simd(void) {
|
||||
#if defined(__wasm_simd128__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_llamafile(void) {
|
||||
#if defined(GGML_USE_LLAMAFILE)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_sse3(void) {
|
||||
#if defined(__SSE3__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_ssse3(void) {
|
||||
#if defined(__SSSE3__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_vsx(void) {
|
||||
#if defined(__POWER9_VECTOR__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_neon(void) {
|
||||
#if defined(__ARM_ARCH)
|
||||
return ggml_arm_arch_features.has_neon;
|
663
ggml/src/ggml-cpu/ggml-cpu.cpp
Normal file
663
ggml/src/ggml-cpu/ggml-cpu.cpp
Normal file
@ -0,0 +1,663 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "ggml-backend-impl.h"
|
||||
#include "ggml-cpu.h"
|
||||
#include "ggml-cpu-aarch64.h"
|
||||
#include "ggml-impl.h"
|
||||
#include <cctype>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined(__APPLE__)
|
||||
#include <sys/types.h>
|
||||
#include <sys/sysctl.h>
|
||||
#endif
|
||||
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
// ggml-backend interface
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
|
||||
// buffer type HBM
|
||||
|
||||
#include <hbwmalloc.h>
|
||||
|
||||
static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU_HBM";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||
hbw_free(buffer->context);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
void * ptr;
|
||||
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||
if (result != 0) {
|
||||
GGML_LOG_ERROR("failed to allocate HBM buffer of size %zu\n", size);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
buffer->buft = buft;
|
||||
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||
},
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_buffer_type_hbm;
|
||||
}
|
||||
#endif
|
||||
|
||||
// buffer type AARCH64
|
||||
|
||||
static void ggml_backend_cpu_aarch64_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||
tensor->extra = (void *)ggml_aarch64_get_optimal_repack_type(tensor); // NOLINT
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_aarch64_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||
GGML_ASSERT(offset == 0);
|
||||
GGML_ASSERT(size == ggml_nbytes(tensor));
|
||||
|
||||
enum ggml_type repack_type = (enum ggml_type)(intptr_t)tensor->extra;
|
||||
|
||||
ggml_aarch64_repack_tensor(tensor, repack_type, data, size);
|
||||
|
||||
GGML_UNUSED(buffer);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_aarch64_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
|
||||
return "CPU_AARCH64";
|
||||
|
||||
GGML_UNUSED(buft);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_aarch64_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
auto * buffer = ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
|
||||
|
||||
if (buffer == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
buffer->buft = buft;
|
||||
buffer->iface.init_tensor = ggml_backend_cpu_aarch64_buffer_init_tensor;
|
||||
buffer->iface.set_tensor = ggml_backend_cpu_aarch64_buffer_set_tensor;
|
||||
|
||||
return buffer;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void) {
|
||||
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_aarch64 = {
|
||||
/* .iface = */ {
|
||||
/* .get_name = */ ggml_backend_cpu_aarch64_buffer_type_get_name,
|
||||
/* .alloc_buffer = */ ggml_backend_cpu_aarch64_buffer_type_alloc_buffer,
|
||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
|
||||
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
|
||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||
/* .is_host = */ NULL,
|
||||
},
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_buffer_type_aarch64;
|
||||
}
|
||||
|
||||
bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft) {
|
||||
return buft == ggml_backend_cpu_aarch64_buffer_type();
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
|
||||
static std::vector<ggml_backend_buffer_type_t> bufts = []() {
|
||||
std::vector<ggml_backend_buffer_type_t> bufts;
|
||||
|
||||
#ifdef GGML_USE_CPU_HBM
|
||||
bufts.push_back(ggml_backend_cpu_hbm_buffer_type());
|
||||
#endif
|
||||
|
||||
#ifdef GGML_USE_CPU_AARCH64
|
||||
bufts.push_back(ggml_backend_cpu_aarch64_buffer_type());
|
||||
#endif
|
||||
|
||||
bufts.push_back(NULL);
|
||||
|
||||
return bufts;
|
||||
}();
|
||||
|
||||
return bufts.data();
|
||||
|
||||
GGML_UNUSED(device);
|
||||
}
|
||||
|
||||
// CPU backend - backend (stream)
|
||||
|
||||
struct ggml_backend_cpu_context {
|
||||
int n_threads;
|
||||
ggml_threadpool_t threadpool;
|
||||
|
||||
uint8_t * work_data;
|
||||
size_t work_size;
|
||||
|
||||
ggml_abort_callback abort_callback;
|
||||
void * abort_callback_data;
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_get_name(ggml_backend_t backend) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_free(ggml_backend_t backend) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
delete[] cpu_ctx->work_data;
|
||||
delete cpu_ctx;
|
||||
delete backend;
|
||||
}
|
||||
|
||||
struct ggml_backend_plan_cpu {
|
||||
struct ggml_cplan cplan;
|
||||
struct ggml_cgraph cgraph;
|
||||
};
|
||||
|
||||
static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_backend_plan_cpu * cpu_plan = new ggml_backend_plan_cpu;
|
||||
|
||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
||||
|
||||
if (cpu_plan->cplan.work_size > 0) {
|
||||
cpu_plan->cplan.work_data = new uint8_t[cpu_plan->cplan.work_size];
|
||||
if (cpu_plan->cplan.work_data == NULL) {
|
||||
delete cpu_plan;
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cpu_plan->cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
|
||||
return cpu_plan;
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
delete[] cpu_plan->cplan.work_data;
|
||||
delete cpu_plan;
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
|
||||
struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
|
||||
|
||||
return ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
|
||||
|
||||
GGML_UNUSED(backend);
|
||||
}
|
||||
|
||||
static enum ggml_status ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||
|
||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
|
||||
|
||||
if (cpu_ctx->work_size < cplan.work_size) {
|
||||
delete[] cpu_ctx->work_data;
|
||||
cpu_ctx->work_data = new uint8_t[cplan.work_size];
|
||||
if (cpu_ctx->work_data == NULL) {
|
||||
cpu_ctx->work_size = 0;
|
||||
return GGML_STATUS_ALLOC_FAILED;
|
||||
}
|
||||
cpu_ctx->work_size = cplan.work_size;
|
||||
}
|
||||
cplan.work_data = (uint8_t *)cpu_ctx->work_data;
|
||||
|
||||
cplan.abort_callback = cpu_ctx->abort_callback;
|
||||
cplan.abort_callback_data = cpu_ctx->abort_callback_data;
|
||||
|
||||
return ggml_graph_compute(cgraph, &cplan);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_i ggml_backend_cpu_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_get_name,
|
||||
/* .free = */ ggml_backend_cpu_free,
|
||||
/* .set_tensor_async = */ NULL,
|
||||
/* .get_tensor_async = */ NULL,
|
||||
/* .cpy_tensor_async = */ NULL,
|
||||
/* .synchronize = */ NULL,
|
||||
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
|
||||
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
|
||||
/* .graph_plan_update = */ NULL,
|
||||
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
|
||||
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
|
||||
/* .event_record = */ NULL,
|
||||
/* .event_wait = */ NULL,
|
||||
};
|
||||
|
||||
static ggml_guid_t ggml_backend_cpu_guid(void) {
|
||||
static ggml_guid guid = { 0xaa, 0x67, 0xc7, 0x43, 0x96, 0xe6, 0xa3, 0x8a, 0xe3, 0xaf, 0xea, 0x92, 0x36, 0xbc, 0xfc, 0x89 };
|
||||
return &guid;
|
||||
}
|
||||
|
||||
ggml_backend_t ggml_backend_cpu_init(void) {
|
||||
// initialize CPU backend now to avoid slowing the first graph computation
|
||||
ggml_cpu_init();
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = new ggml_backend_cpu_context;
|
||||
if (ctx == NULL) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
ctx->n_threads = GGML_DEFAULT_N_THREADS;
|
||||
ctx->threadpool = NULL;
|
||||
ctx->work_data = NULL;
|
||||
ctx->work_size = 0;
|
||||
ctx->abort_callback = NULL;
|
||||
ctx->abort_callback_data = NULL;
|
||||
|
||||
ggml_backend_t cpu_backend = new ggml_backend {
|
||||
/* .guid = */ ggml_backend_cpu_guid(),
|
||||
/* .interface = */ ggml_backend_cpu_i,
|
||||
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
|
||||
/* .context = */ ctx,
|
||||
};
|
||||
|
||||
if (cpu_backend == NULL) {
|
||||
delete ctx;
|
||||
return NULL;
|
||||
}
|
||||
|
||||
return cpu_backend;
|
||||
}
|
||||
|
||||
bool ggml_backend_is_cpu(ggml_backend_t backend) {
|
||||
return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_cpu_guid());
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
ctx->n_threads = n_threads;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_threadpool(ggml_backend_t backend_cpu, ggml_threadpool_t threadpool) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
|
||||
if (ctx->threadpool && ctx->threadpool != threadpool) {
|
||||
// already had a different threadpool, pause/suspend it before switching
|
||||
ggml_threadpool_pause(ctx->threadpool);
|
||||
}
|
||||
ctx->threadpool = threadpool;
|
||||
}
|
||||
|
||||
void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data) {
|
||||
GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
|
||||
|
||||
struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
|
||||
ctx->abort_callback = abort_callback;
|
||||
ctx->abort_callback_data = abort_callback_data;
|
||||
}
|
||||
|
||||
// CPU backend - device
|
||||
|
||||
struct ggml_backend_cpu_device_context {
|
||||
std::string description = "CPU";
|
||||
|
||||
ggml_backend_cpu_device_context() {
|
||||
#ifdef __APPLE__
|
||||
size_t len = 0;
|
||||
if (!sysctlbyname("machdep.cpu.brand_string", NULL, &len, NULL, 0)) {
|
||||
description.resize(len);
|
||||
sysctlbyname("machdep.cpu.brand_string", &description[0], &len, NULL, 0); // NOLINT
|
||||
}
|
||||
#elif defined(__linux__)
|
||||
FILE * f = fopen("/proc/cpuinfo", "r");
|
||||
if (f) {
|
||||
char buf[1024];
|
||||
while (fgets(buf, sizeof(buf), f)) {
|
||||
if (strncmp(buf, "model name", 10) == 0) {
|
||||
char * p = strchr(buf, ':');
|
||||
if (p) {
|
||||
p++;
|
||||
while (std::isspace(*p)) {
|
||||
p++;
|
||||
}
|
||||
while (std::isspace(p[strlen(p) - 1])) {
|
||||
p[strlen(p) - 1] = '\0';
|
||||
}
|
||||
description = p;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
fclose(f);
|
||||
}
|
||||
#elif defined(_WIN32)
|
||||
HKEY hKey;
|
||||
if (RegOpenKeyEx(HKEY_LOCAL_MACHINE,
|
||||
TEXT("HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0"),
|
||||
0,
|
||||
KEY_READ,
|
||||
&hKey) == ERROR_SUCCESS) {
|
||||
DWORD cpu_brand_size = 0;
|
||||
if (RegQueryValueExA(hKey,
|
||||
TEXT("ProcessorNameString"),
|
||||
NULL,
|
||||
NULL,
|
||||
NULL,
|
||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||
description.resize(cpu_brand_size);
|
||||
if (RegQueryValueExA(hKey,
|
||||
TEXT("ProcessorNameString"),
|
||||
NULL,
|
||||
NULL,
|
||||
(LPBYTE)&description[0], // NOLINT
|
||||
&cpu_brand_size) == ERROR_SUCCESS) {
|
||||
if (description.find('\0') != std::string::npos) {
|
||||
description.resize(description.find('\0'));
|
||||
}
|
||||
}
|
||||
}
|
||||
RegCloseKey(hKey);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cpu_device_get_name(ggml_backend_dev_t dev) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t dev) {
|
||||
struct ggml_backend_cpu_device_context * ctx = (struct ggml_backend_cpu_device_context *)dev->context;
|
||||
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
// TODO
|
||||
*free = 0;
|
||||
*total = 0;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
|
||||
return GGML_BACKEND_DEVICE_TYPE_CPU;
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static void ggml_backend_cpu_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_cpu_device_get_name(dev);
|
||||
props->description = ggml_backend_cpu_device_get_description(dev);
|
||||
props->type = ggml_backend_cpu_device_get_type(dev);
|
||||
ggml_backend_cpu_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
props->caps = {
|
||||
/* .async = */ false,
|
||||
/* .host_buffer = */ false,
|
||||
/* .buffer_from_host_ptr = */ true,
|
||||
/* .events = */ false,
|
||||
};
|
||||
}
|
||||
|
||||
static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
|
||||
return ggml_backend_cpu_init();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(params);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_type_t ggml_backend_cpu_device_get_buffer_type(ggml_backend_dev_t dev) {
|
||||
return ggml_backend_cpu_buffer_type();
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
|
||||
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
GGML_UNUSED(max_tensor_size);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
|
||||
const struct ggml_tensor * src0 = op->src[0];
|
||||
const struct ggml_tensor * src1 = op->src[1];
|
||||
|
||||
if (src0 && src0->buffer && ggml_backend_cpu_buft_is_aarch64(src0->buffer->buft)) {
|
||||
if (op->op != GGML_OP_MUL_MAT || src0->type != GGML_TYPE_Q4_0 || ggml_aarch64_get_optimal_repack_type(src0) == GGML_TYPE_Q4_0) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 1; i < GGML_MAX_SRC; i++) {
|
||||
if (op->src[i] && op->src[i]->buffer && ggml_backend_cpu_buft_is_aarch64(op->src[i]->buffer->buft)) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
switch (op->op) {
|
||||
case GGML_OP_CPY:
|
||||
return
|
||||
op->type != GGML_TYPE_IQ2_XXS &&
|
||||
op->type != GGML_TYPE_IQ2_XS &&
|
||||
op->type != GGML_TYPE_IQ1_S &&
|
||||
op->type != GGML_TYPE_IQ1_M; // missing type_traits.from_float
|
||||
case GGML_OP_MUL_MAT:
|
||||
return src1->type == GGML_TYPE_F32 || src1->type == ggml_get_type_traits_cpu(src0->type)->vec_dot_type;
|
||||
case GGML_OP_ROPE_BACK:
|
||||
return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
|
||||
case GGML_OP_IM2COL_BACK:
|
||||
return src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32;
|
||||
case GGML_OP_OUT_PROD:
|
||||
return (src0->type == GGML_TYPE_F32 || ggml_is_quantized(src0->type)) && src1->type == GGML_TYPE_F32;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static bool ggml_backend_cpu_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
|
||||
return ggml_backend_buft_is_host(buft) || ggml_backend_cpu_buft_is_aarch64(buft);
|
||||
|
||||
GGML_UNUSED(dev);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_device_i ggml_backend_cpu_device_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_device_get_name,
|
||||
/* .get_description = */ ggml_backend_cpu_device_get_description,
|
||||
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
|
||||
/* .get_type = */ ggml_backend_cpu_device_get_type,
|
||||
/* .get_props = */ ggml_backend_cpu_device_get_props,
|
||||
/* .init_backend = */ ggml_backend_cpu_device_init_backend,
|
||||
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
|
||||
/* .get_host_buffer_type = */ NULL,
|
||||
/* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
|
||||
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
|
||||
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
|
||||
/* .offload_op = */ NULL,
|
||||
/* .event_new = */ NULL,
|
||||
/* .event_free = */ NULL,
|
||||
/* .event_synchronize = */ NULL,
|
||||
};
|
||||
|
||||
// CPU backend - backend (reg)
|
||||
|
||||
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
|
||||
return "CPU";
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static size_t ggml_backend_cpu_reg_get_device_count(ggml_backend_reg_t reg) {
|
||||
return 1;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static ggml_backend_dev_t ggml_backend_cpu_reg_get_device(ggml_backend_reg_t reg, size_t index) {
|
||||
GGML_ASSERT(index == 0);
|
||||
|
||||
static ggml_backend_cpu_device_context ctx;
|
||||
static ggml_backend_device ggml_backend_cpu_device = {
|
||||
/* .iface = */ ggml_backend_cpu_device_i,
|
||||
/* .reg = */ reg,
|
||||
/* .context = */ &ctx,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_device;
|
||||
}
|
||||
|
||||
struct ggml_backend_feature {
|
||||
const char * name;
|
||||
const char * value;
|
||||
};
|
||||
|
||||
// Not used yet
|
||||
// This is intended to replace the the ggml_cpu_has_* functions when loading the CPU backend dynamically,
|
||||
// and additionally to allow other backends to expose their own list of features that applications can query using the same API.
|
||||
static ggml_backend_feature * ggml_backend_cpu_get_features(ggml_backend_reg_t reg) {
|
||||
static std::vector<ggml_backend_feature> features = []() {
|
||||
std::vector<ggml_backend_feature> features;
|
||||
if (ggml_cpu_has_sse3()) {
|
||||
features.push_back({ "SSE3", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_ssse3()) {
|
||||
features.push_back({ "SSSE3", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx()) {
|
||||
features.push_back({ "AVX", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx2()) {
|
||||
features.push_back({ "AVX2", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_f16c()) {
|
||||
features.push_back({ "F16C", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_fma()) {
|
||||
features.push_back({ "FMA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx_vnni()) {
|
||||
features.push_back({ "AVX_VNNI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512()) {
|
||||
features.push_back({ "AVX512", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512_vbmi()) {
|
||||
features.push_back({ "AVX512_VBMI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512_vnni()) {
|
||||
features.push_back({ "AVX512_VNNI", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_avx512_bf16()) {
|
||||
features.push_back({ "AVX512_BF16", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_amx_int8()) {
|
||||
features.push_back({ "AMX_INT8", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_neon()) {
|
||||
features.push_back({ "NEON", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_arm_fma()) {
|
||||
features.push_back({ "ARM_FMA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_fp16_va()) {
|
||||
features.push_back({ "FP16_VA", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_matmul_int8()) {
|
||||
features.push_back({ "MATMUL_INT8", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_sve()) {
|
||||
features.push_back({ "SVE", "1" });
|
||||
}
|
||||
if (ggml_cpu_get_sve_cnt() > 0) {
|
||||
static std::string sve_cnt = std::to_string(ggml_cpu_get_sve_cnt());
|
||||
features.push_back({ "SVE_CNT", sve_cnt.c_str() });
|
||||
}
|
||||
if (ggml_cpu_has_riscv_v()) {
|
||||
features.push_back({ "RISCV_V", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_vsx()) {
|
||||
features.push_back({ "VSX", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_wasm_simd()) {
|
||||
features.push_back({ "WASM_SIMD", "1" });
|
||||
}
|
||||
if (ggml_cpu_has_llamafile()) {
|
||||
features.push_back({ "LLAMAFILE", "1" });
|
||||
}
|
||||
|
||||
features.push_back({ nullptr, nullptr });
|
||||
|
||||
return features;
|
||||
}();
|
||||
|
||||
return features.data();
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static void * ggml_backend_cpu_get_proc_address(ggml_backend_reg_t reg, const char * name) {
|
||||
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
|
||||
return (void *)ggml_backend_cpu_set_n_threads;
|
||||
}
|
||||
if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
|
||||
return (void *)ggml_backend_cpu_get_extra_bufts;
|
||||
}
|
||||
|
||||
return NULL;
|
||||
|
||||
GGML_UNUSED(reg);
|
||||
}
|
||||
|
||||
static const struct ggml_backend_reg_i ggml_backend_cpu_reg_i = {
|
||||
/* .get_name = */ ggml_backend_cpu_reg_get_name,
|
||||
/* .get_device_count = */ ggml_backend_cpu_reg_get_device_count,
|
||||
/* .get_device = */ ggml_backend_cpu_reg_get_device,
|
||||
/* .get_proc_address = */ ggml_backend_cpu_get_proc_address,
|
||||
};
|
||||
|
||||
ggml_backend_reg_t ggml_backend_cpu_reg(void) {
|
||||
// init CPU feature detection
|
||||
ggml_cpu_init();
|
||||
|
||||
static struct ggml_backend_reg ggml_backend_cpu_reg = {
|
||||
/* .iface = */ ggml_backend_cpu_reg_i,
|
||||
/* .context = */ NULL,
|
||||
};
|
||||
|
||||
return &ggml_backend_cpu_reg;
|
||||
}
|
1885
ggml/src/ggml-cpu/llamafile/sgemm.cpp
Normal file
1885
ggml/src/ggml-cpu/llamafile/sgemm.cpp
Normal file
File diff suppressed because it is too large
Load Diff
155
ggml/src/ggml-cuda/CMakeLists.txt
Normal file
155
ggml/src/ggml-cuda/CMakeLists.txt
Normal file
@ -0,0 +1,155 @@
|
||||
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
||||
|
||||
find_package(CUDAToolkit)
|
||||
|
||||
if (CUDAToolkit_FOUND)
|
||||
message(STATUS "CUDA Toolkit found")
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# native == GPUs available at build time
|
||||
# 52 == Maxwell, lowest CUDA 12 standard
|
||||
# 60 == P100, FP16 CUDA intrinsics
|
||||
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
||||
# 70 == V100, FP16 tensor cores
|
||||
# 75 == Turing, int8 tensor cores
|
||||
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "native")
|
||||
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
file(GLOB GGML_HEADERS_CUDA "*.cuh")
|
||||
list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
|
||||
|
||||
file(GLOB GGML_SOURCES_CUDA "*.cu")
|
||||
file(GLOB SRCS "template-instances/fattn-wmma*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/mmq*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
|
||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||
file(GLOB SRCS "template-instances/fattn-vec*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
||||
else()
|
||||
file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
endif()
|
||||
|
||||
add_library(ggml-cuda
|
||||
${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_CUDA}
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-cuda PRIVATE ggml-base)
|
||||
target_include_directories(ggml-cuda PRIVATE . ..)
|
||||
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (GGML_CUDA_GRAPHS)
|
||||
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
add_compile_definitions(GGML_CUDA_NO_VMM)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
add_compile_definitions(GGML_CUDA_F16)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_PEER_COPY)
|
||||
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||
endif()
|
||||
|
||||
if (GGML_STATIC)
|
||||
if (WIN32)
|
||||
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||
else ()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
endif()
|
||||
else()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
|
||||
else()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
|
||||
endif()
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math)
|
||||
|
||||
if (GGML_FATAL_WARNINGS)
|
||||
list(APPEND CUDA_FLAGS -Werror all-warnings)
|
||||
endif()
|
||||
|
||||
if (GGML_ALL_WARNINGS AND NOT MSVC)
|
||||
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
|
||||
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
|
||||
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND ${NVCC_CMD} -Xcompiler --version
|
||||
OUTPUT_VARIABLE CUDA_CCFULLVER
|
||||
ERROR_QUIET
|
||||
)
|
||||
|
||||
if (NOT CUDA_CCFULLVER MATCHES clang)
|
||||
set(CUDA_CCID "GNU")
|
||||
execute_process(
|
||||
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
|
||||
OUTPUT_VARIABLE CUDA_CCVER
|
||||
ERROR_QUIET
|
||||
)
|
||||
else()
|
||||
if (CUDA_CCFULLVER MATCHES Apple)
|
||||
set(CUDA_CCID "AppleClang")
|
||||
else()
|
||||
set(CUDA_CCID "Clang")
|
||||
endif()
|
||||
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
|
||||
endif()
|
||||
|
||||
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||
|
||||
get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
||||
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
|
||||
endif()
|
||||
|
||||
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
|
||||
|
||||
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
|
||||
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
|
||||
endif()
|
||||
|
||||
target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||
else()
|
||||
message(FATAL_ERROR "CUDA Toolkit not found")
|
||||
endif()
|
@ -6,7 +6,7 @@
|
||||
#include <cstdint>
|
||||
#include <memory>
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#if defined(GGML_USE_HIP)
|
||||
#define GGML_COMMON_DECL_HIP
|
||||
#define GGML_COMMON_IMPL_HIP
|
||||
#else
|
||||
@ -26,13 +26,13 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#if defined(GGML_USE_HIP)
|
||||
#include "vendors/hip.h"
|
||||
#elif defined(GGML_USE_MUSA)
|
||||
#include "vendors/musa.h"
|
||||
#else
|
||||
#include "vendors/cuda.h"
|
||||
#endif // defined(GGML_USE_HIPBLAS)
|
||||
#endif // defined(GGML_USE_HIP)
|
||||
|
||||
#define STRINGIZE_IMPL(...) #__VA_ARGS__
|
||||
#define STRINGIZE(...) STRINGIZE_IMPL(__VA_ARGS__)
|
||||
@ -97,7 +97,7 @@ void ggml_cuda_error(const char * stmt, const char * func, const char * file, in
|
||||
|
||||
#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str)
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS)
|
||||
#if !defined(GGML_USE_HIP)
|
||||
static const char * cu_get_error_str(CUresult err) {
|
||||
const char * err_str;
|
||||
cuGetErrorString(err, &err_str);
|
||||
@ -120,21 +120,21 @@ typedef float dfloat; // dequantize float
|
||||
typedef float2 dfloat2;
|
||||
#endif // GGML_CUDA_F16
|
||||
|
||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
#define FP16_AVAILABLE
|
||||
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
|
||||
|
||||
#if defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||
#define FAST_FP16_AVAILABLE
|
||||
#endif // defined(FP16_AVAILABLE) && __CUDA_ARCH__ != 610
|
||||
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||
#define FP16_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||
#define INT8_MMA_AVAILABLE
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_TURING
|
||||
|
||||
#if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= CC_QY1)
|
||||
#define FLASH_ATTN_AVAILABLE
|
||||
@ -156,14 +156,14 @@ static constexpr bool int8_mma_available(const int cc) {
|
||||
static __device__ void no_device_code(
|
||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
|
||||
file_name, line, function_name, arch);
|
||||
GGML_UNUSED(arch_list);
|
||||
#else
|
||||
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
|
||||
file_name, line, function_name, arch, arch_list);
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
__trap();
|
||||
|
||||
GGML_UNUSED(no_device_code); // suppress unused function warning
|
||||
@ -176,7 +176,7 @@ static __device__ void no_device_code(
|
||||
#endif // __CUDA_ARCH__
|
||||
|
||||
static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
||||
return __reduce_add_sync(0xffffffff, x);
|
||||
#else
|
||||
#pragma unroll
|
||||
@ -184,7 +184,7 @@ static __device__ __forceinline__ int warp_reduce_sum(int x) {
|
||||
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
|
||||
}
|
||||
return x;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_AMPERE
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ float warp_reduce_sum(float x) {
|
||||
@ -207,7 +207,7 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
|
||||
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||
#ifdef FP16_AVAILABLE
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
|
||||
@ -221,7 +221,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
|
||||
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
|
||||
}
|
||||
return a;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
@ -240,11 +240,11 @@ static __device__ __forceinline__ float warp_reduce_max(float x) {
|
||||
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
|
||||
#ifdef FP16_AVAILABLE
|
||||
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
return __float2half(fmaxf(__half2float(a), __half2float(b)));
|
||||
#else
|
||||
return __hmax(a, b);
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
|
||||
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
@ -254,7 +254,7 @@ static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
|
||||
#if CUDART_VERSION >= CUDART_HMAX
|
||||
return __hmax2(a, b);
|
||||
@ -269,11 +269,11 @@ static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const hal
|
||||
GGML_UNUSED(a);
|
||||
GGML_UNUSED(b);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
}
|
||||
|
||||
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
#pragma unroll
|
||||
for (int mask = 16; mask > 0; mask >>= 1) {
|
||||
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
|
||||
@ -282,7 +282,7 @@ static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
|
||||
#else
|
||||
GGML_UNUSED(x);
|
||||
NO_DEVICE_CODE;
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
|
||||
}
|
||||
|
||||
#if CUDART_VERSION < CUDART_HMASK
|
||||
@ -294,7 +294,7 @@ static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half
|
||||
#endif // CUDART_VERSION < CUDART_HMASK
|
||||
|
||||
static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, int c) {
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(__gfx906__) || defined(__gfx908__) || defined(__gfx90a__) || defined(RDNA2)
|
||||
c = __builtin_amdgcn_sdot4(a, b, c, false);
|
||||
#elif defined(RDNA3)
|
||||
@ -320,7 +320,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
||||
#endif
|
||||
return c;
|
||||
|
||||
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#if __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
return __dp4a(a, b, c);
|
||||
@ -330,7 +330,7 @@ static __device__ __forceinline__ int ggml_cuda_dp4a(const int a, const int b, i
|
||||
return c + a8[0]*b8[0] + a8[1]*b8[1] + a8[2]*b8[2] + a8[3]*b8[3];
|
||||
#endif // __CUDA_ARCH__ >= MIN_CC_DP4A
|
||||
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
// TODO: move to ggml-common.h
|
||||
|
@ -517,9 +517,9 @@ constexpr __device__ dequantize_1_f32_t get_dequantize_1_f32(ggml_type type_V) {
|
||||
}
|
||||
|
||||
template<int D, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_combine_results(
|
||||
const float * __restrict__ VKQ_parts,
|
||||
const float2 * __restrict__ VKQ_meta,
|
||||
|
@ -5,9 +5,9 @@
|
||||
#define FATTN_KQ_STRIDE_TILE_F16 64
|
||||
|
||||
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_tile_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
|
@ -5,9 +5,9 @@
|
||||
#define FATTN_KQ_STRIDE_TILE_F32 32
|
||||
|
||||
template<int D, int ncols, int nwarps, int parallel_blocks, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_tile_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
|
@ -2,9 +2,9 @@
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_vec_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
|
@ -2,9 +2,9 @@
|
||||
#include "fattn-common.cuh"
|
||||
|
||||
template<int D, int ncols, int parallel_blocks, ggml_type type_K, ggml_type type_V, bool use_logit_softcap> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_vec_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
|
@ -7,9 +7,9 @@
|
||||
|
||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t, bool use_logit_softcap>
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
|
@ -16,11 +16,11 @@
|
||||
#include "ggml-cuda/cpy.cuh"
|
||||
#include "ggml-cuda/cross-entropy-loss.cuh"
|
||||
#include "ggml-cuda/diagmask.cuh"
|
||||
#include "ggml-cuda/dmmv.cuh"
|
||||
#include "ggml-cuda/fattn.cuh"
|
||||
#include "ggml-cuda/getrows.cuh"
|
||||
#include "ggml-cuda/im2col.cuh"
|
||||
#include "ggml-cuda/mmq.cuh"
|
||||
#include "ggml-cuda/mmv.cuh"
|
||||
#include "ggml-cuda/mmvq.cuh"
|
||||
#include "ggml-cuda/norm.cuh"
|
||||
#include "ggml-cuda/opt-step-adamw.cuh"
|
||||
@ -91,7 +91,7 @@ int ggml_cuda_get_device() {
|
||||
|
||||
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
|
||||
ggml_cuda_set_device(device);
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(GGML_HIP_UMA)
|
||||
#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
|
||||
auto res = hipMallocManaged(ptr, size);
|
||||
if (res == hipSuccess) {
|
||||
// if error we "need" to know why...
|
||||
@ -100,7 +100,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||
return res;
|
||||
#else
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS)
|
||||
#if !defined(GGML_USE_HIP)
|
||||
cudaError_t err;
|
||||
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
|
||||
{
|
||||
@ -113,7 +113,7 @@ static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device)
|
||||
return err;
|
||||
#else
|
||||
return cudaMalloc(ptr, size);
|
||||
#endif // !defined(GGML_USE_HIPBLAS)
|
||||
#endif // !defined(GGML_USE_HIP)
|
||||
|
||||
#endif
|
||||
}
|
||||
@ -151,7 +151,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
int device_vmm = 0;
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
CUdevice device;
|
||||
CU_CHECK(cuDeviceGet(&device, id));
|
||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||
@ -163,7 +163,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
alloc_prop.location.id = id;
|
||||
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
info.devices[id].vmm = !!device_vmm;
|
||||
|
||||
cudaDeviceProp prop;
|
||||
@ -175,13 +175,13 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
|
||||
info.devices[id].nsm = prop.multiProcessorCount;
|
||||
info.devices[id].smpb = prop.sharedMemPerBlock;
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlock;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
for (int id = 0; id < info.device_count; ++id) {
|
||||
@ -299,7 +299,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||
};
|
||||
|
||||
// pool with virtual memory
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||
|
||||
@ -393,14 +393,14 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||
GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
|
||||
}
|
||||
};
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
|
||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
if (ggml_cuda_info().devices[device].vmm) {
|
||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||
}
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
|
||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
|
||||
}
|
||||
|
||||
@ -1020,114 +1020,6 @@ typedef void (*ggml_cuda_op_mul_mat_t)(
|
||||
|
||||
#define MUL_MAT_SRC1_COL_STRIDE 128
|
||||
|
||||
static __global__ void mul_mat_p021_f16_f32(
|
||||
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) {
|
||||
|
||||
const half * x = (const half *) vx;
|
||||
|
||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
||||
const int channel_x = channel / (nchannels_y / nchannels_x);
|
||||
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_dst = nrows_x;
|
||||
const int row_dst = row_x;
|
||||
|
||||
float tmp = 0.0f;
|
||||
|
||||
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
||||
const int col_x = col_x0 + threadIdx.x;
|
||||
|
||||
if (col_x >= ncols_x) {
|
||||
break;
|
||||
}
|
||||
|
||||
// x is transposed and permuted
|
||||
const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x;
|
||||
const float xi = __half2float(x[ix]);
|
||||
|
||||
const int row_y = col_x;
|
||||
|
||||
// y is not transposed but permuted
|
||||
const int iy = channel*nrows_y + row_y;
|
||||
|
||||
tmp += xi * y[iy];
|
||||
}
|
||||
|
||||
// dst is not transposed and not permuted
|
||||
const int idst = channel*nrows_dst + row_dst;
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
dst[idst] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous
|
||||
const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x,
|
||||
const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) {
|
||||
|
||||
const half * x = (const half *) vx;
|
||||
|
||||
const int row_x = blockDim.y*blockIdx.y + threadIdx.y;
|
||||
const int channel = blockDim.z*blockIdx.z + threadIdx.z;
|
||||
const int channel_x = channel / channel_x_divisor;
|
||||
|
||||
const int nrows_y = ncols_x;
|
||||
const int nrows_dst = nrows_x;
|
||||
const int row_dst = row_x;
|
||||
|
||||
const int idst = channel*nrows_dst + row_dst;
|
||||
|
||||
float tmp = 0.0f;
|
||||
|
||||
for (int col_x0 = 0; col_x0 < ncols_x; col_x0 += blockDim.x) {
|
||||
const int col_x = col_x0 + threadIdx.x;
|
||||
|
||||
if (col_x >= ncols_x) {
|
||||
break;
|
||||
}
|
||||
|
||||
const int row_y = col_x;
|
||||
|
||||
const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x;
|
||||
const int iy = channel*nrows_y + row_y;
|
||||
|
||||
const float xi = __half2float(x[ix]);
|
||||
|
||||
tmp += xi * y[iy];
|
||||
}
|
||||
|
||||
// sum up partial sums and write back result
|
||||
tmp = warp_reduce_sum(tmp);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
dst[idst] = tmp;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_mul_mat_p021_f16_f32_cuda(
|
||||
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x,
|
||||
const int nchannels_x, const int nchannels_y, cudaStream_t stream) {
|
||||
|
||||
const dim3 block_nums(1, nrows_x, nchannels_y);
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
mul_mat_p021_f16_f32<<<block_nums, block_dims, 0, stream>>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y);
|
||||
}
|
||||
|
||||
static void ggml_mul_mat_vec_nc_f16_f32_cuda(
|
||||
const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x,
|
||||
const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) {
|
||||
|
||||
const dim3 block_nums(1, nrows_x, nchannels_y);
|
||||
const dim3 block_dims(WARP_SIZE, 1, 1);
|
||||
mul_mat_vec_nc_f16_f32<<<block_nums, block_dims, 0, stream>>>
|
||||
(vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x);
|
||||
}
|
||||
|
||||
static cudaError_t ggml_cuda_cpy_tensor_2d(
|
||||
void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) {
|
||||
|
||||
@ -1325,7 +1217,7 @@ static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) {
|
||||
static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
||||
void * dst, int dstDevice, size_t dpitch, void * src, int srcDevice, size_t spitch, size_t width, size_t height, cudaStream_t stream) {
|
||||
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
// cudaMemcpy2DAsync may fail with copies between vmm pools of different devices
|
||||
cudaMemcpy3DPeerParms p = {};
|
||||
p.dstDevice = dstDevice;
|
||||
@ -1339,7 +1231,7 @@ static cudaError_t ggml_cuda_Memcpy2DPeerAsync(
|
||||
GGML_UNUSED(dstDevice);
|
||||
GGML_UNUSED(srcDevice);
|
||||
return cudaMemcpy2DAsync(dst, dpitch, src, spitch, width, height, cudaMemcpyDeviceToDevice, stream);
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA)
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
|
||||
}
|
||||
|
||||
static void ggml_cuda_op_mul_mat(
|
||||
@ -1654,58 +1546,6 @@ static void ggml_cuda_op_mul_mat(
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
||||
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
cudaStream_t main_stream = ctx.stream();
|
||||
|
||||
void * src0_ddq = src0->data;
|
||||
float * src1_ddf = (float *) src1->data;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
|
||||
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
||||
}
|
||||
|
||||
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
|
||||
const int64_t nb01 = src0->nb[1];
|
||||
const int64_t nb02 = src0->nb[2];
|
||||
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
|
||||
cudaStream_t main_stream = ctx.stream();
|
||||
|
||||
void * src0_ddq = src0->data;
|
||||
float * src1_ddf = (float *) src1->data;
|
||||
float * dst_ddf = (float *) dst->data;
|
||||
|
||||
const int64_t row_stride_x = nb01 / sizeof(half);
|
||||
const int64_t channel_stride_x = nb02 / sizeof(half);
|
||||
|
||||
ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream);
|
||||
}
|
||||
|
||||
static __global__ void k_compute_batched_ptrs(
|
||||
const half * src0_as_f16, const half * src1_as_f16, char * dst,
|
||||
const void ** ptrs_src, void ** ptrs_dst,
|
||||
@ -1879,21 +1719,17 @@ static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, co
|
||||
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
|
||||
|
||||
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
|
||||
bool use_mul_mat_vec = src0->type == GGML_TYPE_F16
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src0->ne[0] % (GGML_CUDA_DMMV_X*2) == 0 && src1->ne[1] == 1;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||
&& src0->ne[0] % 2 == 0 && src1->ne[1] == 1;
|
||||
bool use_mul_mat_vec_q = ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
|
||||
&& src1->ne[1] <= MMVQ_MAX_BATCH_SIZE;
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
||||
bool use_mul_mat_q = ggml_is_quantized(src0->type)
|
||||
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;
|
||||
|
||||
// if mmvq is available it's a better choice than dmmv:
|
||||
#ifndef GGML_CUDA_FORCE_DMMV
|
||||
use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
|
||||
#endif // GGML_CUDA_FORCE_DMMV
|
||||
|
||||
bool any_gpus_with_slow_fp16 = false;
|
||||
bool any_gpus_with_slow_fp16 = false;
|
||||
bool any_gpus_without_fp16_mma = false;
|
||||
|
||||
if (split) {
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
|
||||
@ -1904,14 +1740,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
continue;
|
||||
}
|
||||
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||
const int cc = ggml_cuda_info().devices[id].cc;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
|
||||
}
|
||||
} else {
|
||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||
const int cc = ggml_cuda_info().devices[ctx.device].cc;
|
||||
use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
|
||||
any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
|
||||
any_gpus_without_fp16_mma = any_gpus_without_fp16_mma || !fp16_mma_available(cc);
|
||||
}
|
||||
|
||||
// debug helpers
|
||||
@ -1922,18 +1760,16 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
|
||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||
|
||||
if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
// FP32 precision KQ single-batch for batch size 1 without FlashAttention
|
||||
ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst);
|
||||
} else if (!split && any_gpus_with_slow_fp16 && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// FP32 precision KQV single-batch for batch size 1 without FlashAttention
|
||||
ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst);
|
||||
if (!split && use_mul_mat_vec && dst->ne[3] == 1 && (src0->ne[1] < MMV_MAX_ROWS || any_gpus_without_fp16_mma)) {
|
||||
// the custom F16 vector kernel can be used over batched cuBLAS GEMM
|
||||
// but this is only faster for GPUs without tensor cores or with a thin src0 matrix (particularly KQV in attention)
|
||||
ggml_cuda_mul_mat_vec(ctx, src0, src1, dst);
|
||||
} else if (!split && src0->type == GGML_TYPE_F16 && (src1->type == GGML_TYPE_F16 || !any_gpus_with_slow_fp16)
|
||||
&& !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
// KQ + KQV multi-batch without FlashAttention
|
||||
// general KQ + KQV multi-batch without FlashAttention
|
||||
ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst);
|
||||
} else if (use_dequantize_mul_mat_vec) {
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, nullptr);
|
||||
} else if (use_mul_mat_vec) {
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec, nullptr);
|
||||
} else if (use_mul_mat_vec_q) {
|
||||
ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, quantize_row_q8_1_cuda);
|
||||
} else if (use_mul_mat_q) {
|
||||
@ -2978,6 +2814,17 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
|
||||
{
|
||||
struct ggml_tensor * a = op->src[0];
|
||||
struct ggml_tensor * b = op->src[1];
|
||||
// for small weight matrices the active device can end up without any rows, don't use row split in those cases
|
||||
// this avoids some edge cases (and the performance would not be good anyways)
|
||||
if (a->buffer && ggml_backend_buft_is_cuda_split(a->buffer->buft)) {
|
||||
ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) a->buffer->buft->context;
|
||||
int64_t row_low;
|
||||
int64_t row_high;
|
||||
get_row_split(&row_low, &row_high, a, buft_ctx->tensor_split, dev_ctx->device);
|
||||
if (row_low == row_high) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (b->type == GGML_TYPE_F16 && a->type != GGML_TYPE_F16) {
|
||||
return false;
|
||||
}
|
155
ggml/src/ggml-cuda/ggml/CMakeLists.txt
Normal file
155
ggml/src/ggml-cuda/ggml/CMakeLists.txt
Normal file
@ -0,0 +1,155 @@
|
||||
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
||||
|
||||
find_package(CUDAToolkit)
|
||||
|
||||
if (CUDAToolkit_FOUND)
|
||||
message(STATUS "CUDA Toolkit found")
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# native == GPUs available at build time
|
||||
# 52 == Maxwell, lowest CUDA 12 standard
|
||||
# 60 == P100, FP16 CUDA intrinsics
|
||||
# 61 == Pascal, __dp4a instruction (per-byte integer dot product)
|
||||
# 70 == V100, FP16 tensor cores
|
||||
# 75 == Turing, int8 tensor cores
|
||||
if (GGML_NATIVE AND CUDAToolkit_VERSION VERSION_GREATER_EQUAL "11.6" AND CMAKE_VERSION VERSION_GREATER_EQUAL "3.24")
|
||||
set(CMAKE_CUDA_ARCHITECTURES "native")
|
||||
elseif(GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
enable_language(CUDA)
|
||||
|
||||
file(GLOB GGML_HEADERS_CUDA "*.cuh")
|
||||
list(APPEND GGML_HEADERS_CUDA "../../include/ggml-cuda.h")
|
||||
|
||||
file(GLOB GGML_SOURCES_CUDA "*.cu")
|
||||
file(GLOB SRCS "template-instances/fattn-wmma*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/mmq*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
|
||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||
file(GLOB SRCS "template-instances/fattn-vec*.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
||||
else()
|
||||
file(GLOB SRCS "template-instances/fattn-vec*q4_0-q4_0.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-vec*q8_0-q8_0.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
file(GLOB SRCS "template-instances/fattn-vec*f16-f16.cu")
|
||||
list(APPEND GGML_SOURCES_CUDA ${SRCS})
|
||||
endif()
|
||||
|
||||
add_library(ggml-cuda
|
||||
${GGML_HEADERS_CUDA}
|
||||
${GGML_SOURCES_CUDA}
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-cuda PRIVATE ggml-base)
|
||||
target_include_directories(ggml-cuda PRIVATE . ..)
|
||||
|
||||
add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
|
||||
|
||||
if (GGML_CUDA_GRAPHS)
|
||||
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
add_compile_definitions(GGML_CUDA_NO_VMM)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
add_compile_definitions(GGML_CUDA_F16)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_PEER_COPY)
|
||||
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||
endif()
|
||||
|
||||
if (GGML_STATIC)
|
||||
if (WIN32)
|
||||
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
|
||||
else ()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
|
||||
endif()
|
||||
else()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cudart CUDA::cublas CUDA::cublasLt)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_VMM)
|
||||
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
|
||||
else()
|
||||
target_link_libraries(ggml-cuda PRIVATE CUDA::cuda_driver)
|
||||
endif()
|
||||
|
||||
set(CUDA_CXX_FLAGS "")
|
||||
|
||||
set(CUDA_FLAGS -use_fast_math)
|
||||
|
||||
if (GGML_FATAL_WARNINGS)
|
||||
list(APPEND CUDA_FLAGS -Werror all-warnings)
|
||||
endif()
|
||||
|
||||
if (GGML_ALL_WARNINGS AND NOT MSVC)
|
||||
set(NVCC_CMD ${CMAKE_CUDA_COMPILER} .c)
|
||||
if (NOT CMAKE_CUDA_HOST_COMPILER STREQUAL "")
|
||||
list(APPEND NVCC_CMD -ccbin ${CMAKE_CUDA_HOST_COMPILER})
|
||||
endif()
|
||||
|
||||
execute_process(
|
||||
COMMAND ${NVCC_CMD} -Xcompiler --version
|
||||
OUTPUT_VARIABLE CUDA_CCFULLVER
|
||||
ERROR_QUIET
|
||||
)
|
||||
|
||||
if (NOT CUDA_CCFULLVER MATCHES clang)
|
||||
set(CUDA_CCID "GNU")
|
||||
execute_process(
|
||||
COMMAND ${NVCC_CMD} -Xcompiler "-dumpfullversion -dumpversion"
|
||||
OUTPUT_VARIABLE CUDA_CCVER
|
||||
ERROR_QUIET
|
||||
)
|
||||
else()
|
||||
if (CUDA_CCFULLVER MATCHES Apple)
|
||||
set(CUDA_CCID "AppleClang")
|
||||
else()
|
||||
set(CUDA_CCID "Clang")
|
||||
endif()
|
||||
string(REGEX REPLACE "^.* version ([0-9.]*).*$" "\\1" CUDA_CCVER ${CUDA_CCFULLVER})
|
||||
endif()
|
||||
|
||||
message("-- CUDA host compiler is ${CUDA_CCID} ${CUDA_CCVER}")
|
||||
|
||||
get_flags(${CUDA_CCID} ${CUDA_CCVER})
|
||||
list(APPEND CUDA_CXX_FLAGS ${CXX_FLAGS} ${GF_CXX_FLAGS}) # This is passed to -Xcompiler later
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
list(APPEND CUDA_CXX_FLAGS -Wno-pedantic)
|
||||
endif()
|
||||
|
||||
list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED) # pass host compiler flags as a single argument
|
||||
|
||||
if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
|
||||
list(APPEND CUDA_FLAGS -Xcompiler ${CUDA_CXX_FLAGS_JOINED})
|
||||
endif()
|
||||
|
||||
target_compile_options(ggml-cuda PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDA_FLAGS}>")
|
||||
else()
|
||||
message(FATAL_ERROR "CUDA Toolkit not found")
|
||||
endif()
|
@ -100,9 +100,9 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
return 128;
|
||||
#else // INT8_MMA_AVAILABLE
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
return 128;
|
||||
#else // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#else // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
#if __CUDA_ARCH__ >= CC_VOLTA
|
||||
#ifdef GGML_CUDA_FORCE_MMQ
|
||||
@ -115,7 +115,7 @@ static constexpr __device__ int get_mmq_x_max_device() {
|
||||
return 64;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // INT8_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
@ -124,7 +124,7 @@ static constexpr int get_mmq_y_host(const int cc) {
|
||||
}
|
||||
|
||||
static constexpr __device__ int get_mmq_y_device() {
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(RDNA1)
|
||||
return 64;
|
||||
#else
|
||||
@ -136,7 +136,7 @@ static constexpr __device__ int get_mmq_y_device() {
|
||||
#else
|
||||
return 64;
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
#define MMQ_DP4A_TXS_Q4_0 tile_x_sizes{mmq_y*WARP_SIZE + mmq_y, mmq_y*WARP_SIZE/QI4_0 + mmq_y/QI4_0, 0}
|
||||
@ -2569,7 +2569,7 @@ static __device__ void mul_mat_q_process_tile(
|
||||
// The mul_mat_q kernel implements "stream-k" work partitioning as described in https://arxiv.org/abs/2301.03598
|
||||
|
||||
template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
#if defined(RDNA3) || defined(RDNA2)
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // defined(RDNA3) || defined(RDNA2)
|
||||
@ -2579,7 +2579,7 @@ template <ggml_type type, int mmq_x, int nwarps, bool need_check>
|
||||
#else
|
||||
__launch_bounds__(WARP_SIZE*nwarps, 2)
|
||||
#endif // __CUDA_ARCH__ >= CC_VOLTA
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
static __global__ void mul_mat_q(
|
||||
const char * __restrict__ x, const char * __restrict__ yc, float * __restrict__ dst, float * __restrict__ tmp_fixup,
|
||||
const int ne00, const int ne01, const int stride01, const int ne10, const int ne11, const int stride11, const int ne0) {
|
||||
@ -2594,7 +2594,7 @@ static __global__ void mul_mat_q(
|
||||
constexpr int mmq_y = get_mmq_y_device();
|
||||
|
||||
// On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
|
||||
#if (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||
#if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||
{
|
||||
constexpr bool fixup = false;
|
||||
mul_mat_q_process_tile<type, mmq_x, nwarps, need_check, fixup>
|
||||
@ -2602,7 +2602,7 @@ static __global__ void mul_mat_q(
|
||||
blockIdx.x, blockIdx.y, 0, ne00/qk);
|
||||
return;
|
||||
}
|
||||
#endif // (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||
#endif // (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < CC_VOLTA
|
||||
|
||||
const int64_t blocks_per_ne00 = ne00 / qk;
|
||||
constexpr int blocks_per_iter = MMQ_ITER_K / qk;
|
||||
@ -2765,14 +2765,14 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
|
||||
|
||||
const int shmem = mmq_get_shmem<type>(mmq_x, mmq_y, cc);
|
||||
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static bool shmem_limit_raised[GGML_CUDA_MAX_DEVICES] = {false};
|
||||
if (!shmem_limit_raised[id]) {
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, false>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||
CUDA_CHECK(cudaFuncSetAttribute(mul_mat_q<type, mmq_x, MMQ_NWARPS, true>, cudaFuncAttributeMaxDynamicSharedMemorySize, shmem));
|
||||
shmem_limit_raised[id] = true;
|
||||
}
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
|
||||
const int nty = (args.ne01 + mmq_y - 1) / mmq_y;
|
||||
const int ntx = (args.ne11 + mmq_x - 1) / mmq_x;
|
||||
|
223
ggml/src/ggml-cuda/mmv.cu
Normal file
223
ggml/src/ggml-cuda/mmv.cu
Normal file
@ -0,0 +1,223 @@
|
||||
#include "common.cuh"
|
||||
#include "mmv.cuh"
|
||||
|
||||
template <typename type_acc, int block_size>
|
||||
static __global__ void mul_mat_vec(
|
||||
const half * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
|
||||
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
|
||||
const int64_t row = blockIdx.x;
|
||||
const int64_t channel = blockIdx.z;
|
||||
const int tid = threadIdx.x;
|
||||
|
||||
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
|
||||
y += channel *stride_channel_y;
|
||||
dst += channel *stride_channel_dst;
|
||||
|
||||
const half2 * x2 = (const half2 *) x;
|
||||
const float2 * y2 = (const float2 *) y;
|
||||
|
||||
extern __shared__ char data_mmv[];
|
||||
float * buf_iw = (float *) data_mmv;
|
||||
|
||||
if (block_size > WARP_SIZE) {
|
||||
if (tid < WARP_SIZE) {
|
||||
buf_iw[tid] = 0.0f;
|
||||
}
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
float sumf;
|
||||
|
||||
if (std::is_same<type_acc, float>::value) {
|
||||
sumf = 0.0f;
|
||||
|
||||
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
|
||||
const float2 tmpx = __half22float2(x2[col2]);
|
||||
const float2 tmpy = y2[col2];
|
||||
sumf += tmpx.x * tmpy.x;
|
||||
sumf += tmpx.y * tmpy.y;
|
||||
}
|
||||
} else {
|
||||
#ifdef FP16_AVAILABLE
|
||||
half2 sumh2 = make_half2(0.0f, 0.0f);
|
||||
|
||||
for (int64_t col2 = tid; col2 < ncols2; col2 += block_size) {
|
||||
const float2 tmp = y2[col2];
|
||||
sumh2 += x2[col2] * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
|
||||
sumf = __low2float(sumh2) + __high2float(sumh2);
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
sumf = warp_reduce_sum(sumf);
|
||||
|
||||
if (block_size > WARP_SIZE) {
|
||||
buf_iw[tid/WARP_SIZE] = sumf;
|
||||
__syncthreads();
|
||||
if (tid > WARP_SIZE) {
|
||||
return;
|
||||
}
|
||||
sumf = buf_iw[tid];
|
||||
sumf = warp_reduce_sum(sumf);
|
||||
}
|
||||
|
||||
if (tid != 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
dst[row] = sumf;
|
||||
}
|
||||
|
||||
template <typename type_acc>
|
||||
static void launch_mul_mat_vec_cuda(
|
||||
const half * x, const float * y, float * dst,
|
||||
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
|
||||
cudaStream_t stream) {
|
||||
GGML_ASSERT(ncols % 2 == 0);
|
||||
GGML_ASSERT(stride_row % 2 == 0);
|
||||
GGML_ASSERT(nchannels_y % nchannels_x == 0);
|
||||
const int64_t channel_ratio = nchannels_y / nchannels_x;
|
||||
|
||||
int64_t block_size_best = WARP_SIZE;
|
||||
int64_t niter_best = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
|
||||
for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
|
||||
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
|
||||
if (niter < niter_best) {
|
||||
niter_best = niter;
|
||||
block_size_best = block_size;
|
||||
}
|
||||
}
|
||||
|
||||
const int smem = WARP_SIZE*sizeof(float);
|
||||
const dim3 block_nums(nrows, 1, nchannels_y);
|
||||
const dim3 block_dims(block_size_best, 1, 1);
|
||||
switch (block_size_best) {
|
||||
case 32: {
|
||||
mul_mat_vec<type_acc, 32><<<block_nums, block_dims, smem, stream>>>
|
||||
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
|
||||
} break;
|
||||
case 64: {
|
||||
mul_mat_vec<type_acc, 64><<<block_nums, block_dims, smem, stream>>>
|
||||
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
|
||||
} break;
|
||||
case 96: {
|
||||
mul_mat_vec<type_acc, 96><<<block_nums, block_dims, smem, stream>>>
|
||||
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
|
||||
} break;
|
||||
case 128: {
|
||||
mul_mat_vec<type_acc, 128><<<block_nums, block_dims, smem, stream>>>
|
||||
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
|
||||
} break;
|
||||
case 160: {
|
||||
mul_mat_vec<type_acc, 160><<<block_nums, block_dims, smem, stream>>>
|
||||
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
|
||||
} break;
|
||||
case 192: {
|
||||
mul_mat_vec<type_acc, 192><<<block_nums, block_dims, smem, stream>>>
|
||||
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
|
||||
} break;
|
||||
case 224: {
|
||||
mul_mat_vec<type_acc, 224><<<block_nums, block_dims, smem, stream>>>
|
||||
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
|
||||
} break;
|
||||
case 256: {
|
||||
mul_mat_vec<type_acc, 256><<<block_nums, block_dims, smem, stream>>>
|
||||
(x, y, dst, ncols/2, stride_row, channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst);
|
||||
} break;
|
||||
default: {
|
||||
GGML_ABORT("fatal error");
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
static void mul_mat_vec_cuda(
|
||||
const half * x, const float * y, float * dst,
|
||||
const int64_t ncols, const int64_t nrows, const int64_t stride_row, const int64_t nchannels_x, const int64_t nchannels_y,
|
||||
const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst,
|
||||
enum ggml_prec prec, cudaStream_t stream) {
|
||||
switch (prec) {
|
||||
case GGML_PREC_DEFAULT: {
|
||||
launch_mul_mat_vec_cuda<half>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
|
||||
stride_channel_x, stride_channel_y, stride_channel_dst, stream);
|
||||
} break;
|
||||
case GGML_PREC_F32: {
|
||||
launch_mul_mat_vec_cuda<float>(x, y, dst, ncols, nrows, stride_row, nchannels_x, nchannels_y,
|
||||
stride_channel_x, stride_channel_y, stride_channel_dst, stream);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t ne01 = src0->ne[1];
|
||||
|
||||
GGML_ASSERT(src1->ne[1] == 1);
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
|
||||
|
||||
const half * src0_d = (const half *) src0->data;
|
||||
const float * src1_d = (const float *) src1->data;
|
||||
float * dst_d = (float *) dst->data;
|
||||
|
||||
const int64_t ne02 = src0->ne[2];
|
||||
const int64_t ne12 = src1->ne[2];
|
||||
GGML_ASSERT(dst->ne[2] == ne12);
|
||||
|
||||
GGML_ASSERT(src0->ne[3] == 1);
|
||||
GGML_ASSERT(src1->ne[3] == 1);
|
||||
GGML_ASSERT( dst->ne[3] == 1);
|
||||
|
||||
const int64_t stride_row = src0->nb[1] / ggml_type_size(src0->type);
|
||||
const int64_t channel_stride_x = src0->nb[2] / ggml_type_size(src0->type);
|
||||
const int64_t channel_stride_y = src1->nb[2] / ggml_type_size(src1->type);
|
||||
const int64_t channel_stride_dst = dst->nb[2] / ggml_type_size( dst->type);
|
||||
|
||||
mul_mat_vec_cuda(src0_d, src1_d, dst_d, ne00, ne01, stride_row, ne02, ne12, channel_stride_x, channel_stride_y, channel_stride_dst, prec, ctx.stream());
|
||||
}
|
||||
|
||||
void ggml_cuda_op_mul_mat_vec(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, cudaStream_t stream) {
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(dst->type == GGML_TYPE_F32);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
GGML_ASSERT(src1_ncols == 1);
|
||||
|
||||
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||
const enum ggml_prec prec = fast_fp16_available(cc) ? ggml_prec(dst->op_params[0]) : GGML_PREC_F32;
|
||||
|
||||
|
||||
// ggml_cuda_op provides single, contiguous matrices
|
||||
const int64_t stride_row = ne00;
|
||||
const int64_t nchannels_x = 1;
|
||||
const int64_t nchannels_y = 1;
|
||||
const int64_t channel_stride_x = 0;
|
||||
const int64_t channel_stride_y = 0;
|
||||
const int64_t channel_stride_dst = 0;
|
||||
|
||||
mul_mat_vec_cuda((const half *) src0_dd_i, src1_ddf_i, dst_dd_i, ne00, row_diff, stride_row,
|
||||
nchannels_x, nchannels_y, channel_stride_x, channel_stride_y, channel_stride_dst, prec, stream);
|
||||
|
||||
GGML_UNUSED(ctx);
|
||||
GGML_UNUSED(src1);
|
||||
GGML_UNUSED(dst);
|
||||
GGML_UNUSED(src1_ddq_i);
|
||||
GGML_UNUSED(src1_ncols);
|
||||
GGML_UNUSED(src1_padded_row_size);
|
||||
}
|
12
ggml/src/ggml-cuda/mmv.cuh
Normal file
12
ggml/src/ggml-cuda/mmv.cuh
Normal file
@ -0,0 +1,12 @@
|
||||
#include "common.cuh"
|
||||
|
||||
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
|
||||
#define MMV_MAX_ROWS 512
|
||||
|
||||
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_mul_mat_vec(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
|
||||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, cudaStream_t stream);
|
@ -48,10 +48,10 @@ static constexpr __device__ int get_vdr_mmvq(ggml_type type) {
|
||||
}
|
||||
|
||||
template <ggml_type type, int ncols_y>
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
// tell the compiler to use as many registers as it wants, see nwarps definition below
|
||||
__launch_bounds__((ncols_y <= 4 ? 4 : 2)*WARP_SIZE, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void mul_mat_vec_q(
|
||||
const void * __restrict__ vx, const void * __restrict__ vy, float * __restrict__ dst,
|
||||
const int ncols_x, const int nrows_x, const int nrows_y, const int nrows_dst) {
|
||||
@ -62,13 +62,13 @@ static __global__ void mul_mat_vec_q(
|
||||
|
||||
constexpr vec_dot_q_cuda_t vec_dot_q_cuda = get_vec_dot_q_cuda(type);
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && (defined(RDNA2) || defined(RDNA3))
|
||||
constexpr int nwarps = 1;
|
||||
constexpr int rows_per_cuda_block = 1;
|
||||
#else
|
||||
constexpr int nwarps = ncols_y <= 4 ? 4 : 2;
|
||||
constexpr int rows_per_cuda_block = ncols_y == 1 ? 1 : 2;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__) && !defined(RDNA2) && !defined(RDNA3)
|
||||
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
const int row0 = rows_per_cuda_block*blockIdx.x;
|
||||
|
@ -1,11 +1,11 @@
|
||||
#include "ggml-impl.h"
|
||||
#include "opt-step-adamw.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
static __global__ void opt_step_adamw_f32(
|
||||
float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v, const int64_t k,
|
||||
const float alpha, const float beta1, const float beta2, const float eps, const float wd,
|
||||
const float beta1h, const float beta2h) {
|
||||
float * __restrict__ x, const float * __restrict__ g, float * __restrict__ g_m, float * __restrict__ g_v,
|
||||
const float * __restrict__ pars, const int64_t k) {
|
||||
|
||||
const int64_t i = (int64_t) blockIdx.x*blockDim.x + threadIdx.x;
|
||||
|
||||
@ -13,6 +13,14 @@ static __global__ void opt_step_adamw_f32(
|
||||
return;
|
||||
}
|
||||
|
||||
const float alpha = pars[0];
|
||||
const float beta1 = pars[1];
|
||||
const float beta2 = pars[2];
|
||||
const float eps = pars[3];
|
||||
const float wd = pars[4];
|
||||
const float beta1h = pars[5];
|
||||
const float beta2h = pars[6];
|
||||
|
||||
const float gi = g[i];
|
||||
const float gmi = g_m[i]*beta1 + gi*(1.0f - beta1);
|
||||
const float gvi = g_v[i]*beta2 + gi*gi*(1.0f - beta2);
|
||||
@ -23,58 +31,48 @@ static __global__ void opt_step_adamw_f32(
|
||||
const float mh = gmi*beta1h;
|
||||
const float vh = sqrtf(gvi*beta2h) + eps;
|
||||
|
||||
x[i] = x[i]*(1.0f - alpha*wd) - mh/vh;
|
||||
x[i] = x[i]*(1.0f - alpha*wd) - alpha*mh/vh;
|
||||
}
|
||||
|
||||
static void opt_step_adamw_f32_cuda(
|
||||
float * x, const float * g, float * g_m, float * g_v, const int64_t k,
|
||||
const float alpha, const float beta1, const float beta2, const float eps, const float wd,
|
||||
const float beta1h, const float beta2h, cudaStream_t stream) {
|
||||
float * x, const float * g, float * g_m, float * g_v, const float * pars, const int64_t k, cudaStream_t stream) {
|
||||
|
||||
const dim3 block_dims(CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
|
||||
const dim3 block_nums((k + CUDA_OPT_STEP_ADAMW_BLOCK_SIZE - 1) / CUDA_OPT_STEP_ADAMW_BLOCK_SIZE, 1, 1);
|
||||
opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, k, alpha, beta1, beta2, eps, wd, beta1h, beta2h);
|
||||
opt_step_adamw_f32<<<block_nums, block_dims, 0, stream>>>(x, g, g_m, g_v, pars, k);
|
||||
}
|
||||
|
||||
void ggml_cuda_opt_step_adamw(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src0_grad = dst->src[1];
|
||||
const ggml_tensor * src0_grad_m = dst->src[2];
|
||||
const ggml_tensor * src0_grad_v = dst->src[3];
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src0_grad = dst->src[1];
|
||||
const ggml_tensor * src0_grad_m = dst->src[2];
|
||||
const ggml_tensor * src0_grad_v = dst->src[3];
|
||||
const ggml_tensor * adamw_params = dst->src[4];
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0_grad->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0_grad_m->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0_grad_v->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0_grad->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0_grad_m->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(src0_grad_v->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(adamw_params->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0_grad));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0_grad_m));
|
||||
GGML_ASSERT(ggml_is_contiguous(src0_grad_v));
|
||||
GGML_ASSERT(ggml_is_contiguous(adamw_params));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_m));
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src0_grad_v));
|
||||
GGML_ASSERT(ggml_nelements(adamw_params) == 7);
|
||||
|
||||
float * src0_d = (float *) src0->data;
|
||||
const float * src0_grad_d = (const float *) src0_grad->data;
|
||||
float * src0_grad_m_d = (float *) src0_grad_m->data;
|
||||
float * src0_grad_v_d = (float *) src0_grad_v->data;
|
||||
float * src0_d = (float *) src0->data;
|
||||
const float * src0_grad_d = (const float *) src0_grad->data;
|
||||
float * src0_grad_m_d = (float *) src0_grad_m->data;
|
||||
float * src0_grad_v_d = (float *) src0_grad_v->data;
|
||||
const float * adamw_params_d = (const float *) adamw_params->data;
|
||||
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
const int64_t ne = ggml_nelements(src0);
|
||||
|
||||
int64_t iter; memcpy(&iter, &dst->op_params[0], sizeof(int64_t));
|
||||
float alpha; memcpy(&alpha, &dst->op_params[2], sizeof(float));
|
||||
float beta1; memcpy(&beta1, &dst->op_params[3], sizeof(float));
|
||||
float beta2; memcpy(&beta2, &dst->op_params[4], sizeof(float));
|
||||
float eps; memcpy(&eps, &dst->op_params[5], sizeof(float));
|
||||
float wd; memcpy(&wd, &dst->op_params[6], sizeof(float));
|
||||
|
||||
const float beta1h = alpha/(1.0f - powf(beta1, iter));
|
||||
const float beta2h = 1.0f/(1.0f - powf(beta2, iter));
|
||||
|
||||
opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, ne, alpha, beta1, beta2, eps, wd, beta1h, beta2h, stream);
|
||||
|
||||
iter++;
|
||||
memcpy(&dst->op_params[0], &iter, sizeof(int64_t));
|
||||
opt_step_adamw_f32_cuda(src0_d, src0_grad_d, src0_grad_m_d, src0_grad_v_d, adamw_params_d, ne, stream);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
|
||||
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||
#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||
#define USE_CUB
|
||||
#endif // !defined(GGML_USE_HIPBLAS) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||
#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11700
|
||||
|
||||
#ifdef USE_CUB
|
||||
// On Windows CUB uses libraries with variables called CC_PASCAL which conflict with the define in common.cuh.
|
||||
|
106
ggml/src/ggml-hip/CMakeLists.txt
Normal file
106
ggml/src/ggml-hip/CMakeLists.txt
Normal file
@ -0,0 +1,106 @@
|
||||
if (NOT EXISTS $ENV{ROCM_PATH})
|
||||
if (NOT EXISTS /opt/rocm)
|
||||
set(ROCM_PATH /usr)
|
||||
else()
|
||||
set(ROCM_PATH /opt/rocm)
|
||||
endif()
|
||||
else()
|
||||
set(ROCM_PATH $ENV{ROCM_PATH})
|
||||
endif()
|
||||
|
||||
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH})
|
||||
list(APPEND CMAKE_PREFIX_PATH "${ROCM_PATH}/lib64/cmake")
|
||||
|
||||
# CMake on Windows doesn't support the HIP language yet
|
||||
if (WIN32)
|
||||
set(CXX_IS_HIPCC TRUE)
|
||||
else()
|
||||
string(REGEX MATCH "hipcc(\.bat)?$" CXX_IS_HIPCC "${CMAKE_CXX_COMPILER}")
|
||||
endif()
|
||||
|
||||
if (CXX_IS_HIPCC)
|
||||
if (LINUX)
|
||||
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
||||
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
|
||||
endif()
|
||||
|
||||
message(WARNING "Setting hipcc as the C++ compiler is legacy behavior."
|
||||
" Prefer setting the HIP compiler directly. See README for details.")
|
||||
endif()
|
||||
else()
|
||||
# Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
|
||||
if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
|
||||
set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
|
||||
endif()
|
||||
cmake_minimum_required(VERSION 3.21)
|
||||
enable_language(HIP)
|
||||
endif()
|
||||
|
||||
find_package(hip REQUIRED)
|
||||
find_package(hipblas REQUIRED)
|
||||
find_package(rocblas REQUIRED)
|
||||
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
|
||||
file(GLOB GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
|
||||
list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
|
||||
|
||||
file(GLOB GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-wmma*.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
|
||||
if (GGML_CUDA_FA_ALL_QUANTS)
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
|
||||
else()
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
|
||||
list(APPEND GGML_SOURCES_ROCM ${SRCS})
|
||||
endif()
|
||||
|
||||
add_library(ggml-hip
|
||||
${GGML_HEADERS_ROCM}
|
||||
${GGML_SOURCES_ROCM})
|
||||
|
||||
target_link_libraries(ggml-hip PRIVATE ggml-base)
|
||||
target_include_directories(ggml-hip PRIVATE . ..)
|
||||
|
||||
# TODO: do not use CUDA definitions for HIP
|
||||
target_compile_definitions(ggml PUBLIC GGML_USE_CUDA)
|
||||
|
||||
add_compile_definitions(GGML_USE_HIP)
|
||||
|
||||
if (GGML_HIP_UMA)
|
||||
add_compile_definitions(GGML_HIP_UMA)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_FORCE_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
|
||||
endif()
|
||||
|
||||
if (GGML_CUDA_NO_PEER_COPY)
|
||||
add_compile_definitions(GGML_CUDA_NO_PEER_COPY)
|
||||
endif()
|
||||
|
||||
if (CXX_IS_HIPCC)
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
|
||||
target_link_libraries(ggml-hip PRIVATE hip::device)
|
||||
else()
|
||||
set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE HIP)
|
||||
endif()
|
||||
|
||||
if (GGML_STATIC)
|
||||
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
||||
endif()
|
||||
|
||||
target_link_libraries(ggml-hip PRIVATE ggml-base hip::host roc::rocblas roc::hipblas)
|
@ -3,13 +3,29 @@
|
||||
// GGML internal header
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
#include <assert.h>
|
||||
#include <math.h>
|
||||
#include <stdlib.h> // load `stdlib.h` before other headers to work around MinGW bug: https://sourceforge.net/p/mingw-w64/bugs/192/
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef __ARM_FEATURE_SVE
|
||||
#include <arm_sve.h>
|
||||
#endif // __ARM_FEATURE_SVE
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
|
||||
//
|
||||
// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
|
||||
//
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
#if defined(__F16C__)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -28,13 +44,13 @@ extern "C" {
|
||||
// if C99 - static_assert is noop
|
||||
// ref: https://stackoverflow.com/a/53923785/4039976
|
||||
#ifndef __cplusplus
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
#ifndef static_assert
|
||||
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201100L)
|
||||
#define static_assert(cond, msg) _Static_assert(cond, msg)
|
||||
#else
|
||||
#define static_assert(cond, msg) struct global_scope_noop_trick
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static inline int ggml_up32(int n) {
|
||||
@ -120,14 +136,12 @@ struct ggml_map_custom1_op_params {
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_map_custom2_op_params {
|
||||
ggml_custom2_op_t fun;
|
||||
int n_tasks;
|
||||
void * userdata;
|
||||
};
|
||||
|
||||
|
||||
struct ggml_map_custom3_op_params {
|
||||
ggml_custom3_op_t fun;
|
||||
int n_tasks;
|
||||
@ -182,7 +196,7 @@ void ggml_hash_set_reset(struct ggml_hash_set * hash_set);
|
||||
static bool ggml_hash_contains(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
||||
|
||||
// returns GGML_HASHSET_FULL if table is full, otherwise the current index of the key or where it should be inserted
|
||||
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
||||
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key);
|
||||
|
||||
// returns GGML_HASHSET_ALREADY_EXISTS if key already exists, index otherwise, asserts if table is full
|
||||
static size_t ggml_hash_insert(struct ggml_hash_set * hash_set, struct ggml_tensor * key);
|
||||
@ -196,7 +210,7 @@ static inline size_t ggml_hash(const struct ggml_tensor * p) {
|
||||
return (size_t)(uintptr_t)p >> 4;
|
||||
}
|
||||
|
||||
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, struct ggml_tensor * key) {
|
||||
static size_t ggml_hash_find(const struct ggml_hash_set * hash_set, const struct ggml_tensor * key) {
|
||||
size_t h = ggml_hash(key) % hash_set->size;
|
||||
|
||||
// linear probing
|
||||
@ -267,13 +281,14 @@ enum ggml_cgraph_eval_order {
|
||||
};
|
||||
|
||||
struct ggml_cgraph {
|
||||
int size;
|
||||
int n_nodes;
|
||||
int n_leafs;
|
||||
int size; // maximum number of nodes/leafs/grads/grad_accs
|
||||
int n_nodes; // number of nodes currently in use
|
||||
int n_leafs; // number of leafs currently in use
|
||||
|
||||
struct ggml_tensor ** nodes;
|
||||
struct ggml_tensor ** grads;
|
||||
struct ggml_tensor ** leafs;
|
||||
struct ggml_tensor ** nodes; // tensors with data that can change if the graph is evaluated
|
||||
struct ggml_tensor ** grads; // the outputs of these tensors are the gradients of the nodes
|
||||
struct ggml_tensor ** grad_accs; // accumulators for node gradients
|
||||
struct ggml_tensor ** leafs; // tensors with constant data
|
||||
|
||||
struct ggml_hash_set visited_hash_set;
|
||||
|
||||
@ -287,9 +302,249 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
|
||||
void * ggml_aligned_malloc(size_t size);
|
||||
void ggml_aligned_free(void * ptr, size_t size);
|
||||
|
||||
// TODO: move to threading file
|
||||
void ggml_critical_section_start(void);
|
||||
void ggml_critical_section_end(void);
|
||||
// FP16 to FP32 conversion
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
#ifdef _MSC_VER
|
||||
typedef uint16_t ggml_fp16_internal_t;
|
||||
#else
|
||||
typedef __fp16 ggml_fp16_internal_t;
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON) && !defined(_MSC_VER)
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
ggml_fp16_internal_t tmp;
|
||||
memcpy(&tmp, &h, sizeof(ggml_fp16_t));
|
||||
return (float)tmp;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
ggml_fp16_t res;
|
||||
ggml_fp16_internal_t tmp = f;
|
||||
memcpy(&res, &tmp, sizeof(ggml_fp16_t));
|
||||
return res;
|
||||
}
|
||||
|
||||
#elif defined(__F16C__)
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
|
||||
#else
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
|
||||
#endif
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
/* the inline asm below is about 12% faster than the lookup method */
|
||||
#define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
register float f;
|
||||
register double d;
|
||||
__asm__(
|
||||
"mtfprd %0,%2\n"
|
||||
"xscvhpdp %0,%0\n"
|
||||
"frsp %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=f"(f):
|
||||
/* in */ "r"(h));
|
||||
return f;
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
register double d;
|
||||
register ggml_fp16_t r;
|
||||
__asm__( /* xscvdphp can work on double or single precision */
|
||||
"xscvdphp %0,%2\n"
|
||||
"mffprd %1,%0\n" :
|
||||
/* temp */ "=d"(d),
|
||||
/* out */ "=r"(r):
|
||||
/* in */ "f"(f));
|
||||
return r;
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
// FP16 <-> FP32
|
||||
// ref: https://github.com/Maratyszcza/FP16
|
||||
|
||||
static inline float fp32_from_bits(uint32_t w) {
|
||||
union {
|
||||
uint32_t as_bits;
|
||||
float as_value;
|
||||
} fp32;
|
||||
fp32.as_bits = w;
|
||||
return fp32.as_value;
|
||||
}
|
||||
|
||||
static inline uint32_t fp32_to_bits(float f) {
|
||||
union {
|
||||
float as_value;
|
||||
uint32_t as_bits;
|
||||
} fp32;
|
||||
fp32.as_value = f;
|
||||
return fp32.as_bits;
|
||||
}
|
||||
|
||||
static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
|
||||
const uint32_t w = (uint32_t) h << 16;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
const uint32_t two_w = w + w;
|
||||
|
||||
const uint32_t exp_offset = UINT32_C(0xE0) << 23;
|
||||
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
|
||||
const float exp_scale = 0x1.0p-112f;
|
||||
#else
|
||||
const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
|
||||
#endif
|
||||
const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
|
||||
|
||||
const uint32_t magic_mask = UINT32_C(126) << 23;
|
||||
const float magic_bias = 0.5f;
|
||||
const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
|
||||
|
||||
const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
|
||||
const uint32_t result = sign |
|
||||
(two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
|
||||
return fp32_from_bits(result);
|
||||
}
|
||||
|
||||
static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
|
||||
#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
|
||||
const float scale_to_inf = 0x1.0p+112f;
|
||||
const float scale_to_zero = 0x1.0p-110f;
|
||||
#else
|
||||
const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
|
||||
const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
|
||||
#endif
|
||||
float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
|
||||
|
||||
const uint32_t w = fp32_to_bits(f);
|
||||
const uint32_t shl1_w = w + w;
|
||||
const uint32_t sign = w & UINT32_C(0x80000000);
|
||||
uint32_t bias = shl1_w & UINT32_C(0xFF000000);
|
||||
if (bias < UINT32_C(0x71000000)) {
|
||||
bias = UINT32_C(0x71000000);
|
||||
}
|
||||
|
||||
base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
|
||||
const uint32_t bits = fp32_to_bits(base);
|
||||
const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
|
||||
const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
|
||||
const uint32_t nonsign = exp_bits + mantissa_bits;
|
||||
return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
|
||||
}
|
||||
|
||||
#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
|
||||
#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
|
||||
|
||||
#endif // defined(__ARM_NEON) && (!defined(__MSC_VER)
|
||||
|
||||
// precomputed f32 table for f16 (256 KB)
|
||||
// defined in ggml.c, initialized in ggml_init()
|
||||
GGML_API float ggml_table_f32_f16[1 << 16];
|
||||
|
||||
// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
|
||||
// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
|
||||
// This is also true for POWER9.
|
||||
#if !defined(GGML_FP16_TO_FP32)
|
||||
inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
|
||||
uint16_t s;
|
||||
memcpy(&s, &f, sizeof(uint16_t));
|
||||
return ggml_table_f32_f16[s];
|
||||
}
|
||||
|
||||
#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
|
||||
#endif
|
||||
|
||||
#if !defined(GGML_FP32_TO_FP16)
|
||||
#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Converts brain16 to float32.
|
||||
*
|
||||
* The bfloat16 floating point format has the following structure:
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───┐
|
||||
* 0b0000000000000000 brain16
|
||||
*
|
||||
* Since bf16 has the same number of exponent bits as a 32bit float,
|
||||
* encoding and decoding numbers becomes relatively straightforward.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌──┴───┐┌─┴───────────────────┐
|
||||
* 0b00000000000000000000000000000000 IEEE binary32
|
||||
*
|
||||
* For comparison, the standard fp16 format has fewer exponent bits.
|
||||
*
|
||||
* ┌sign
|
||||
* │
|
||||
* │ ┌exponent
|
||||
* │ │
|
||||
* │ │ ┌mantissa
|
||||
* │ │ │
|
||||
* │┌─┴─┐┌─┴──────┐
|
||||
* 0b0000000000000000 IEEE binary16
|
||||
*
|
||||
* @see IEEE 754-2008
|
||||
*/
|
||||
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.i = (uint32_t)h.bits << 16;
|
||||
return u.f;
|
||||
}
|
||||
|
||||
/**
|
||||
* Converts float32 to brain16.
|
||||
*
|
||||
* This is binary identical with Google Brain float conversion.
|
||||
* Floats shall round to nearest even, and NANs shall be quiet.
|
||||
* Subnormals aren't flushed to zero, except perhaps when used.
|
||||
* This code should vectorize nicely if using modern compilers.
|
||||
*/
|
||||
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
|
||||
ggml_bf16_t h;
|
||||
union {
|
||||
float f;
|
||||
uint32_t i;
|
||||
} u;
|
||||
u.f = s;
|
||||
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
|
||||
h.bits = (u.i >> 16) | 64; /* force to quiet */
|
||||
return h;
|
||||
}
|
||||
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
|
||||
return h;
|
||||
}
|
||||
|
||||
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
|
||||
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
162
ggml/src/ggml-kompute/CMakeLists.txt
Normal file
162
ggml/src/ggml-kompute/CMakeLists.txt
Normal file
@ -0,0 +1,162 @@
|
||||
|
||||
find_package(Vulkan COMPONENTS glslc REQUIRED)
|
||||
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
|
||||
|
||||
if (NOT glslc_executable)
|
||||
message(FATAL_ERROR "glslc not found")
|
||||
endif()
|
||||
|
||||
add_library(ggml-kompute
|
||||
ggml-kompute.cpp
|
||||
../../include/ggml-kompute.h
|
||||
)
|
||||
|
||||
target_link_libraries(ggml-kompute PRIVATE ggml-base kompute)
|
||||
target_include_directories(ggml-kompute PRIVATE . .. ${CMAKE_CURRENT_BINARY_DIR})
|
||||
|
||||
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
|
||||
|
||||
function(compile_shader)
|
||||
set(options)
|
||||
set(oneValueArgs)
|
||||
set(multiValueArgs SOURCES)
|
||||
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||
foreach(source ${compile_shader_SOURCES})
|
||||
get_filename_component(filename ${source} NAME)
|
||||
set(spv_file ${filename}.spv)
|
||||
add_custom_command(
|
||||
OUTPUT ${spv_file}
|
||||
DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp
|
||||
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source}
|
||||
COMMENT "Compiling ${source} to ${spv_file}"
|
||||
)
|
||||
|
||||
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
|
||||
set(FILE_NAME "shader${RAW_FILE_NAME}")
|
||||
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
|
||||
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
|
||||
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
|
||||
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
|
||||
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
|
||||
if(CMAKE_GENERATOR MATCHES "Visual Studio")
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$<CONFIG>/xxd"
|
||||
)
|
||||
else()
|
||||
add_custom_command(
|
||||
OUTPUT ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
|
||||
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
|
||||
DEPENDS ${spv_file} xxd
|
||||
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
|
||||
)
|
||||
endif()
|
||||
endforeach()
|
||||
endfunction()
|
||||
|
||||
if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt")
|
||||
message(STATUS "Kompute found")
|
||||
set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level")
|
||||
add_subdirectory(kompute)
|
||||
|
||||
# Compile our shaders
|
||||
compile_shader(SOURCES
|
||||
kompute-shaders/op_scale.comp
|
||||
kompute-shaders/op_scale_8.comp
|
||||
kompute-shaders/op_add.comp
|
||||
kompute-shaders/op_addrow.comp
|
||||
kompute-shaders/op_mul.comp
|
||||
kompute-shaders/op_silu.comp
|
||||
kompute-shaders/op_relu.comp
|
||||
kompute-shaders/op_gelu.comp
|
||||
kompute-shaders/op_softmax.comp
|
||||
kompute-shaders/op_norm.comp
|
||||
kompute-shaders/op_rmsnorm.comp
|
||||
kompute-shaders/op_diagmask.comp
|
||||
kompute-shaders/op_mul_mat_mat_f32.comp
|
||||
kompute-shaders/op_mul_mat_f16.comp
|
||||
kompute-shaders/op_mul_mat_q8_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_0.comp
|
||||
kompute-shaders/op_mul_mat_q4_1.comp
|
||||
kompute-shaders/op_mul_mat_q4_k.comp
|
||||
kompute-shaders/op_mul_mat_q6_k.comp
|
||||
kompute-shaders/op_getrows_f32.comp
|
||||
kompute-shaders/op_getrows_f16.comp
|
||||
kompute-shaders/op_getrows_q4_0.comp
|
||||
kompute-shaders/op_getrows_q4_1.comp
|
||||
kompute-shaders/op_getrows_q6_k.comp
|
||||
kompute-shaders/op_rope_f16.comp
|
||||
kompute-shaders/op_rope_f32.comp
|
||||
kompute-shaders/op_cpy_f16_f16.comp
|
||||
kompute-shaders/op_cpy_f16_f32.comp
|
||||
kompute-shaders/op_cpy_f32_f16.comp
|
||||
kompute-shaders/op_cpy_f32_f32.comp
|
||||
)
|
||||
|
||||
# Create a custom target for our generated shaders
|
||||
add_custom_target(generated_shaders DEPENDS
|
||||
shaderop_scale.h
|
||||
shaderop_scale_8.h
|
||||
shaderop_add.h
|
||||
shaderop_addrow.h
|
||||
shaderop_mul.h
|
||||
shaderop_silu.h
|
||||
shaderop_relu.h
|
||||
shaderop_gelu.h
|
||||
shaderop_softmax.h
|
||||
shaderop_norm.h
|
||||
shaderop_rmsnorm.h
|
||||
shaderop_diagmask.h
|
||||
shaderop_mul_mat_mat_f32.h
|
||||
shaderop_mul_mat_f16.h
|
||||
shaderop_mul_mat_q8_0.h
|
||||
shaderop_mul_mat_q4_0.h
|
||||
shaderop_mul_mat_q4_1.h
|
||||
shaderop_mul_mat_q4_k.h
|
||||
shaderop_mul_mat_q6_k.h
|
||||
shaderop_getrows_f32.h
|
||||
shaderop_getrows_f16.h
|
||||
shaderop_getrows_q4_0.h
|
||||
shaderop_getrows_q4_1.h
|
||||
shaderop_getrows_q6_k.h
|
||||
shaderop_rope_f16.h
|
||||
shaderop_rope_f32.h
|
||||
shaderop_cpy_f16_f16.h
|
||||
shaderop_cpy_f16_f32.h
|
||||
shaderop_cpy_f32_f16.h
|
||||
shaderop_cpy_f32_f32.h
|
||||
)
|
||||
|
||||
# Create a custom command that depends on the generated_shaders
|
||||
add_custom_command(
|
||||
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp
|
||||
DEPENDS generated_shaders
|
||||
COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp"
|
||||
)
|
||||
|
||||
# Add the stamp to the main sources to ensure dependency tracking
|
||||
target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
|
||||
else()
|
||||
message(WARNING "Kompute not found")
|
||||
endif()
|
@ -15,6 +15,7 @@
|
||||
#define TWOPI_F 6.283185307179586f
|
||||
|
||||
#define QK_K 256
|
||||
#define K_SCALE_SIZE 12
|
||||
|
||||
#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx])
|
||||
#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx)
|
||||
@ -64,6 +65,14 @@ mat4 dequantize_q4_1(const block_q4_1 xb, uint il) {
|
||||
return reg;
|
||||
}
|
||||
|
||||
#define sizeof_block_q4_k 144
|
||||
struct block_q4_k {
|
||||
float16_t d;
|
||||
float16_t dmin;
|
||||
uint8_t scales[K_SCALE_SIZE];
|
||||
uint8_t qs[QK_K/2];
|
||||
};
|
||||
|
||||
#define sizeof_block_q6_k 210
|
||||
struct block_q6_k {
|
||||
uint8_t ql[QK_K/2]; // quants, lower 4 bits
|
133
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
Normal file
133
ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp
Normal file
@ -0,0 +1,133 @@
|
||||
#version 450
|
||||
|
||||
#include "common.comp"
|
||||
|
||||
#define N_DST 4
|
||||
#define SIZE_OF_BLOCK sizeof_block_q4_k
|
||||
|
||||
layout(local_size_x = 4) in;
|
||||
layout(local_size_y = 8) in;
|
||||
layout(local_size_z = 1) in;
|
||||
|
||||
layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; };
|
||||
layout (binding = 1) readonly buffer tensorInB { float inB[]; };
|
||||
layout (binding = 2) writeonly buffer tensorOut { float out_[]; };
|
||||
|
||||
layout (push_constant) uniform parameter {
|
||||
uint inAOff;
|
||||
uint inBOff;
|
||||
uint outOff;
|
||||
int ne00;
|
||||
int ne10;
|
||||
int ne0;
|
||||
int ne1;
|
||||
int ne01;
|
||||
int ne02;
|
||||
int ne12;
|
||||
int r2;
|
||||
int r3;
|
||||
} pcs;
|
||||
|
||||
void main() {
|
||||
const uint16_t kmask1 = uint16_t(0x3f3f);
|
||||
const uint16_t kmask2 = uint16_t(0x0f0f);
|
||||
const uint16_t kmask3 = uint16_t(0xc0c0);
|
||||
|
||||
const uint ix = gl_SubgroupInvocationID/8; // 0...3
|
||||
const uint it = gl_SubgroupInvocationID%8; // 0...7
|
||||
const uint iq = it/4; // 0 or 1
|
||||
const uint ir = it%4; // 0...3
|
||||
|
||||
const uint nb = pcs.ne00/QK_K;
|
||||
|
||||
const uint r0 = gl_WorkGroupID.x;
|
||||
const uint r1 = gl_WorkGroupID.y;
|
||||
const uint im = gl_WorkGroupID.z;
|
||||
|
||||
const uint first_row = r0 * N_DST;
|
||||
const uint ib_row = first_row * nb;
|
||||
|
||||
const uint i12 = im%pcs.ne12;
|
||||
const uint i13 = im/pcs.ne12;
|
||||
|
||||
const uint offset0 = (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02);
|
||||
|
||||
const uint xblk = ib_row + offset0 + pcs.inAOff;
|
||||
const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff;
|
||||
|
||||
float yl[16];
|
||||
float yh[16];
|
||||
float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f};
|
||||
float all_sum = 0.f;
|
||||
|
||||
uint y4 = y + ix * QK_K + 64 * iq + 8 * ir;
|
||||
|
||||
for (uint ib = ix; ib < nb; ib += 4) {
|
||||
const uint blk_idx = ib + xblk;
|
||||
|
||||
float sumy[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; ++i) {
|
||||
yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0];
|
||||
yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8];
|
||||
yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0];
|
||||
yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8];
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; row++) {
|
||||
uint row_idx = row * nb;
|
||||
|
||||
uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0);
|
||||
uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2);
|
||||
uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4);
|
||||
uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6);
|
||||
uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8);
|
||||
|
||||
uint16_t sc16[4];
|
||||
sc16[0] = sc_0 & kmask1;
|
||||
sc16[1] = sc_2 & kmask1;
|
||||
sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2);
|
||||
sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2);
|
||||
|
||||
float acc1[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
float acc2[4] = {0.f, 0.f, 0.f, 0.f};
|
||||
for (int i = 0; i < 8; i += 2) {
|
||||
uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i);
|
||||
uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i);
|
||||
acc1[0] += yl[i+0] * (q1 & 0x000F);
|
||||
acc1[1] += yl[i+1] * (q1 & 0x0F00);
|
||||
acc1[2] += yl[i+8] * (q1 & 0x00F0);
|
||||
acc1[3] += yl[i+9] * (q1 & 0xF000);
|
||||
acc2[0] += yh[i+0] * (q2 & 0x000F);
|
||||
acc2[1] += yh[i+1] * (q2 & 0x0F00);
|
||||
acc2[2] += yh[i+8] * (q2 & 0x00F0);
|
||||
acc2[3] += yh[i+9] * (q2 & 0xF000);
|
||||
}
|
||||
|
||||
uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF);
|
||||
uint8_t sc8_1 = uint8_t(sc16[0] >> 8 );
|
||||
uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF);
|
||||
uint8_t sc8_3 = uint8_t(sc16[1] >> 8 );
|
||||
uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF);
|
||||
uint8_t sc8_5 = uint8_t(sc16[2] >> 8 );
|
||||
uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF);
|
||||
uint8_t sc8_7 = uint8_t(sc16[3] >> 8 );
|
||||
|
||||
float dall = float(inA[blk_idx + row_idx].d);
|
||||
float dmin = float(inA[blk_idx + row_idx].dmin);
|
||||
sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 +
|
||||
(acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f +
|
||||
(acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 +
|
||||
(acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) -
|
||||
dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7);
|
||||
}
|
||||
|
||||
y4 += 4 * QK_K;
|
||||
}
|
||||
|
||||
for (int row = 0; row < N_DST; ++row) {
|
||||
all_sum = subgroupAdd(sumf[row]);
|
||||
if (subgroupElect()) {
|
||||
out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum;
|
||||
}
|
||||
}
|
||||
}
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user