Compare commits

..

3 Commits

313 changed files with 52736 additions and 67524 deletions

View File

@ -12,7 +12,7 @@ FROM ${BASE_CUDA_DEV_CONTAINER} as build
ARG CUDA_DOCKER_ARCH=all
RUN apt-get update && \
apt-get install -y build-essential git cmake libsdl2-dev wget
apt-get install -y build-essential git cmake libsdl2-dev
WORKDIR /app
@ -23,6 +23,6 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV GGML_CUDA=1
RUN make base.en
RUN make
ENTRYPOINT ["/app/main"]

View File

@ -17,7 +17,7 @@ ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
ENV GGML_CUDA=1
RUN apt-get update && \
apt-get install -y build-essential libsdl2-dev wget cmake \
apt-get install -y build-essential libsdl2-dev \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
# Ref: https://stackoverflow.com/a/53464012
@ -25,7 +25,7 @@ ENV CUDA_MAIN_VERSION=12.3
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
COPY .. .
RUN make base.en
RUN make
FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
ENV CUDA_MAIN_VERSION=12.3
@ -33,7 +33,7 @@ ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH
WORKDIR /app
RUN apt-get update && \
apt-get install -y curl ffmpeg wget cmake \
apt-get install -y curl ffmpeg \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
COPY --from=build /app /app

View File

@ -2,17 +2,17 @@ FROM ubuntu:22.04 AS build
WORKDIR /app
RUN apt-get update && \
apt-get install -y build-essential wget cmake \
apt-get install -y build-essential \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
COPY .. .
RUN make base.en
RUN make
FROM ubuntu:22.04 AS runtime
WORKDIR /app
RUN apt-get update && \
apt-get install -y curl ffmpeg libsdl2-dev wget cmake \
apt-get install -y curl ffmpeg libsdl2-dev \
&& rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*
COPY --from=build /app /app

View File

@ -3,41 +3,61 @@ on:
push:
paths:
- bindings/ruby/**
- src/**/*.c
- src/**/*.cpp
- src/**/*.h
- src/**/*.m
- src/**/*.metal
- include/**/*.c
- include/**/*.cpp
- include/**/*.h
- include/**/*.m
- include/**/*.metal
- ggml/**/*.c
- ggml/**/*.cpp
- ggml/**/*.h
- ggml/**/*.m
- ggml/**/*.metal
- src/whisper.cpp
- include/whisper.h
- ggml/src/ggml.c
- ggml/src/ggml-impl.h
- ggml/src/ggml-aarch64.h
- ggml/src/ggml-aarch64.c
- ggml/src/ggml-alloc.c
- ggml/src/ggml-backend-impl.h
- ggml/src/ggml-backend.cpp
- ggml/src/ggml-common.h
- ggml/src/ggml-quants.h
- ggml/src/ggml-quants.c
- ggml/src/ggml-cpu-impl.h
- ggml/src/ggml-metal.m
- ggml/src/ggml-metal.metal
- ggml/src/ggml-blas.cpp
- ggml/include/ggml.h
- ggml/include/ggml-alloc.h
- ggml/include/ggml-backend.h
- ggml/include/ggml-cuda.h
- ggml/include/ggml-kompute.h
- ggml/include/ggml-metal.h
- ggml/include/ggml-sycl.h
- ggml/include/ggml-vulkan.h
- ggml/include/ggml-blas.h
- scripts/get-flags.mk
- examples/dr_wav.h
pull_request:
paths:
- bindings/ruby/**
- src/**/*.c
- src/**/*.cpp
- src/**/*.h
- src/**/*.m
- src/**/*.metal
- include/**/*.c
- include/**/*.cpp
- include/**/*.h
- include/**/*.m
- include/**/*.metal
- ggml/**/*.c
- ggml/**/*.cpp
- ggml/**/*.h
- ggml/**/*.m
- ggml/**/*.metal
- src/whisper.cpp
- include/whisper.h
- ggml/src/ggml.c
- ggml/src/ggml-impl.h
- ggml/src/ggml-aarch64.h
- ggml/src/ggml-aarch64.c
- ggml/src/ggml-alloc.c
- ggml/src/ggml-backend-impl.h
- ggml/src/ggml-backend.cpp
- ggml/src/ggml-common.h
- ggml/src/ggml-quants.h
- ggml/src/ggml-quants.c
- ggml/src/ggml-cpu-impl.h
- ggml/src/ggml-metal.m
- ggml/src/ggml-metal.metal
- ggml/src/ggml-blas.cpp
- ggml/include/ggml.h
- ggml/include/ggml-alloc.h
- ggml/include/ggml-backend.h
- ggml/include/ggml-cuda.h
- ggml/include/ggml-kompute.h
- ggml/include/ggml-metal.h
- ggml/include/ggml-sycl.h
- ggml/include/ggml-vulkan.h
- ggml/include/ggml-blas.h
- scripts/get-flags.mk
- examples/dr_wav.h
@ -50,6 +70,6 @@ jobs:
steps:
- uses: ruby/setup-ruby@v1
with:
ruby-version: '3.1'
ruby-version: '3.0'
- uses: actions/checkout@v4
- run: rake test

View File

@ -3,7 +3,6 @@ on: [push, pull_request]
env:
ubuntu_image: "ubuntu:22.04"
VCPKG_BINARY_SOURCES: "clear;x-gha,readwrite"
jobs:
ubuntu-latest:
@ -28,9 +27,9 @@ jobs:
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
set -e
apt update
apt install -y build-essential libsdl2-dev cmake
cmake -B build
cmake --build build --config Release -j $(nproc)'
apt install -y build-essential libsdl2-dev
make
make stream'
macOS-latest:
runs-on: macOS-latest
@ -42,30 +41,30 @@ jobs:
- name: Dependencies
run: |
brew update
brew install sdl2 cmake
brew install sdl2
- name: Build
run: |
cmake -B build
cmake --build build --config Release
make
make stream
# freeBSD-latest:
# runs-on: macos-12
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Build
# uses: cross-platform-actions/action@v0.24.0
# with:
# operating_system: freebsd
# version: '13.3'
# run: |
# sudo pkg update
# sudo pkg install -y gmake sdl2 cmake
# cmake -B build
# cmake --build build --config Release
freeBSD-latest:
runs-on: macos-12
steps:
- name: Clone
uses: actions/checkout@v4
- name: Build
uses: cross-platform-actions/action@v0.24.0
with:
operating_system: freebsd
version: '13.3'
run: |
sudo pkg update
sudo pkg install -y gmake sdl2
gmake
gmake stream
ubuntu-latest-gcc:
runs-on: ubuntu-latest
@ -280,6 +279,21 @@ jobs:
mingw-w64-${{matrix.env}}-SDL2
mingw-w64-${{matrix.env}}-openblas
- name: Build using make
shell: msys2 {0}
run: |
make -j $(nproc)
- name: Clean after building using make
shell: msys2 {0}
run: |
make clean
- name: Build using make w/ OpenBLAS
shell: msys2 {0}
run: |
make GGML_OPENBLAS=1 -j $(nproc)
- name: Build using CMake
shell: msys2 {0}
run: |
@ -294,7 +308,7 @@ jobs:
- name: Build using CMake w/ OpenBLAS
shell: msys2 {0}
run: |
cmake -B build -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
cmake -B build -DGGML_OPENBLAS=ON
cmake --build build --config ${{ matrix.build }} -j $(nproc)
windows:
@ -368,8 +382,10 @@ jobs:
sdl2: [ON]
include:
- arch: Win32
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
s2arc: x86
- arch: x64
obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
s2arc: x64
- sdl2: ON
s2ver: 2.28.5
@ -378,21 +394,17 @@ jobs:
- name: Clone
uses: actions/checkout@v4
- name: Export GitHub Actions cache environment variables
uses: actions/github-script@v7
with:
script: |
core.exportVariable('ACTIONS_CACHE_URL', process.env.ACTIONS_CACHE_URL || '');
core.exportVariable('ACTIONS_RUNTIME_TOKEN', process.env.ACTIONS_RUNTIME_TOKEN || '');
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v2
- name: Install OpenBLAS and pkgconfiglite
- name: Fetch OpenBLAS
if: matrix.blas == 'ON'
run: |
vcpkg install --triplet=${{ matrix.s2arc }}-windows openblas
choco install pkgconfiglite
C:/msys64/usr/bin/wget.exe -qO blas.zip ${{ matrix.obzip }}
7z x blas.zip -oblas -y
copy blas/include/cblas.h .
copy blas/include/openblas_config.h .
echo "OPENBLAS_PATH=$env:GITHUB_WORKSPACE/blas" >> $env:GITHUB_ENV
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
@ -404,10 +416,9 @@ jobs:
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_TOOLCHAIN_FILE="$env:VCPKG_INSTALLATION_ROOT/scripts/buildsystems/vcpkg.cmake"
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DGGML_BLAS=${{ matrix.blas }}
-DGGML_BLAS_VENDOR=OpenBLAS
-DGGML_OPENBLAS=${{ matrix.blas }}
-DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
-DWHISPER_SDL2=${{ matrix.sdl2 }}
- name: Build
@ -415,9 +426,9 @@ jobs:
cd ./build
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
- name: Copy openblas.dll
- name: Copy libopenblas.dll
if: matrix.blas == 'ON'
run: copy "C:/vcpkg/packages/openblas_${{ matrix.s2arc }}-windows/bin/openblas.dll" build/bin/${{ matrix.build }}
run: copy "$env:OPENBLAS_PATH/bin/libopenblas.dll" build/bin/${{ matrix.build }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
@ -430,72 +441,71 @@ jobs:
name: whisper-blas-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
# TODO: fix and re-enable
# windows-cublas:
# runs-on: windows-2019
#
# strategy:
# matrix:
# build: [Release]
# arch: [x64]
# cublas: [ON]
# sdl2: [ON]
# cuda-toolkit: [12.2.0, 11.8.0]
# include:
# - arch: x64
# s2arc: x64
# - sdl2: ON
# s2ver: 2.28.5
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
#
# - name: Add msbuild to PATH
# uses: microsoft/setup-msbuild@v2
#
# - name: Install CUDA Toolkit
# id: cuda-toolkit
# uses: Jimver/cuda-toolkit@v0.2.15
# with:
# cuda: '${{ matrix.cuda-toolkit }}'
#
# - name: Fetch SDL2 and set SDL2_DIR
# if: matrix.sdl2 == 'ON'
# run: |
# C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
# 7z x sdl2.zip
# echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
#
# - name: Configure
# run: >
# cmake -S . -B ./build -A ${{ matrix.arch }}
# -DCMAKE_BUILD_TYPE=${{ matrix.build }}
# -DGGML_CUDA=${{ matrix.cublas }}
# -DWHISPER_SDL2=${{ matrix.sdl2 }}
#
# - name: Build ${{ matrix.cuda-toolkit }}
# run: |
# cd ./build
# cmake --build . --config ${{ matrix.build }}
#
# - name: Copy CUDA DLLs
# run: >
# Copy-Item -PassThru
# -Path "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/*.dll"
# -Include cudart64_*,cublas64_*,cublasLt64_*
# -Destination build/bin/${{ matrix.build }}
#
# - name: Copy SDL2.dll
# if: matrix.sdl2 == 'ON'
# run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
#
# - name: Upload binaries
# if: matrix.sdl2 == 'ON'
# uses: actions/upload-artifact@v4
# with:
# name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
# path: build/bin/${{ matrix.build }}
windows-cublas:
runs-on: windows-2019
strategy:
matrix:
build: [Release]
arch: [x64]
cublas: [ON]
sdl2: [ON]
cuda-toolkit: [12.2.0, 11.8.0]
include:
- arch: x64
s2arc: x64
- sdl2: ON
s2ver: 2.28.5
steps:
- name: Clone
uses: actions/checkout@v4
- name: Add msbuild to PATH
uses: microsoft/setup-msbuild@v2
- name: Install CUDA Toolkit
id: cuda-toolkit
uses: Jimver/cuda-toolkit@v0.2.15
with:
cuda: '${{ matrix.cuda-toolkit }}'
- name: Fetch SDL2 and set SDL2_DIR
if: matrix.sdl2 == 'ON'
run: |
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
7z x sdl2.zip
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
- name: Configure
run: >
cmake -S . -B ./build -A ${{ matrix.arch }}
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
-DGGML_CUDA=${{ matrix.cublas }}
-DWHISPER_SDL2=${{ matrix.sdl2 }}
- name: Build ${{ matrix.cuda-toolkit }}
run: |
cd ./build
cmake --build . --config ${{ matrix.build }}
- name: Copy CUDA DLLs
run: >
Copy-Item -PassThru
-Path "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/*.dll"
-Include cudart64_*,cublas64_*,cublasLt64_*
-Destination build/bin/${{ matrix.build }}
- name: Copy SDL2.dll
if: matrix.sdl2 == 'ON'
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
- name: Upload binaries
if: matrix.sdl2 == 'ON'
uses: actions/upload-artifact@v4
with:
name: whisper-cublas-${{ matrix.cuda-toolkit }}-bin-${{ matrix.arch }}
path: build/bin/${{ matrix.build }}
emscripten:
runs-on: ubuntu-latest
@ -519,7 +529,7 @@ jobs:
emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
make
ios-xcode-build:
ios:
runs-on: macos-latest
strategy:
@ -527,7 +537,7 @@ jobs:
build: [Release]
steps:
- name: Checkout code
- name: Clone
uses: actions/checkout@v4
- name: Configure
@ -535,64 +545,46 @@ jobs:
cp models/for-tests-ggml-base.en.bin models/ggml-base.en.bin
mkdir models/ggml-base.en-encoder.mlmodelc
- name: Build
id: cmake_build
run: |
sysctl -a
mkdir build
cd build
cmake -G Xcode .. \
-DGGML_METAL_USE_BF16=ON \
-DGGML_METAL_EMBED_LIBRARY=ON \
-DWHISPER_BUILD_EXAMPLES=OFF \
-DWHISPER_BUILD_TESTS=OFF \
-DWHISPER_BUILD_SERVER=OFF \
-DCMAKE_SYSTEM_NAME=iOS \
-DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \
-DCMAKE_XCODE_ATTRIBUTE_DEVELOPMENT_TEAM=ggml
cmake --build . --config Release -j $(sysctl -n hw.logicalcpu) -- CODE_SIGNING_ALLOWED=NO
sudo cmake --install . --config Release
- name: xcodebuild for swift package
id: xcodebuild
run: |
xcodebuild -scheme whisper-Package -destination 'generic/platform=iOS'
#- name: Build objc example
# run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphoneos build
- name: Build objc example
run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build
- name: Build swiftui example
run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' build
run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphonesimulator build
# TODO: update android build and re-enable when it works
# android:
# runs-on: ubuntu-latest
#
# steps:
# - name: Clone
# uses: actions/checkout@v4
# with:
# path: whisper
#
# - name: Install Java
# uses: actions/setup-java@v4
# with:
# distribution: zulu
# java-version: 21
#
# - name: Setup Android SDK
# uses: android-actions/setup-android@v3
#
# - name: Build
# run: |
# cd whisper/examples/whisper.android
# ./gradlew assembleRelease --no-daemon
#
# - name: Build with external ggml
# run: |
# export PATH_TO_GGML=$PWD/ggml
# cd whisper/examples/whisper.android
# ./gradlew assembleRelease --no-daemon
android:
runs-on: ubuntu-latest
steps:
- name: Clone
uses: actions/checkout@v4
with:
path: whisper
- name: Clone
uses: actions/checkout@v4
with:
repository: ggerganov/ggml
path: ggml
- name: Install Java
uses: actions/setup-java@v4
with:
distribution: zulu
java-version: 21
- name: Setup Android SDK
uses: android-actions/setup-android@v3
- name: Build
run: |
cd whisper/examples/whisper.android
./gradlew assembleRelease --no-daemon
- name: Build with external ggml
run: |
export PATH_TO_GGML=$PWD/ggml
cd whisper/examples/whisper.android
./gradlew assembleRelease --no-daemon -PGGML_HOME=$PATH_TO_GGML
# TODO: disabled because of the following failure: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
# android_java:
@ -674,6 +666,5 @@ jobs:
- name: Test quantize
run: |
./models/download-ggml-model.sh tiny.en
cmake -B build
cmake --build build --config Release
./build/bin/quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0
make quantize
./quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0

View File

@ -45,7 +45,7 @@ jobs:
with:
context: .
push: true
platforms: ${{ matrix.config.platform }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
file: ${{ matrix.config.dockerfile }}
@ -54,6 +54,6 @@ jobs:
with:
context: .
push: ${{ github.event_name == 'push' }}
platforms: ${{ matrix.config.platform }}
platforms: ${{ matrix.config.platforms }}
tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
file: ${{ matrix.config.dockerfile }}

4
.gitignore vendored
View File

@ -1,6 +1,5 @@
*.o
*.a
*.d
.cache/
.coreml/
.test/
@ -20,9 +19,6 @@ build-*/
.swiftpm
*.metallib
ggml-metal-embed.metal
ggml-metal-embed.metal.tmp
/main
/stream
/command

View File

@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
project("whisper.cpp" C CXX)
project("whisper.cpp" VERSION 1.7.2)
project("whisper.cpp" VERSION 1.7.1)
include(CheckIncludeFileCXX)
set(SOVERSION 1)

1131
Makefile

File diff suppressed because it is too large Load Diff

View File

@ -14,6 +14,47 @@ let package = Package(
.library(name: "whisper", targets: ["whisper"]),
],
targets: [
.systemLibrary(name: "whisper", pkgConfig: "whisper"),
]
.target(
name: "whisper",
path: ".",
exclude: [
"bindings",
"cmake",
"coreml",
"examples",
"extra",
"models",
"samples",
"tests",
"CMakeLists.txt",
"Makefile"
],
sources: [
"ggml/src/ggml.c",
"src/whisper.cpp",
"ggml/src/ggml-aarch64.c",
"ggml/src/ggml-alloc.c",
"ggml/src/ggml-backend.cpp",
"ggml/src/ggml-quants.c",
"ggml/src/ggml-metal.m"
],
resources: [.process("ggml-metal.metal")],
publicHeadersPath: "spm-headers",
cSettings: [
.unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
.define("GGML_USE_ACCELERATE"),
.unsafeFlags(["-fno-objc-arc"]),
.define("GGML_USE_METAL")
// NOTE: NEW_LAPACK will require iOS version 16.4+
// We should consider adding this in the future when we drop support for iOS 14
// (ref: https://developer.apple.com/documentation/accelerate/1513264-cblas_sgemm?language=objc)
// .define("ACCELERATE_NEW_LAPACK"),
// .define("ACCELERATE_LAPACK_ILP64")
],
linkerSettings: [
.linkedFramework("Accelerate")
]
)
],
cxxLanguageStandard: .cxx11
)

View File

@ -7,7 +7,7 @@
[![Conan Center](https://shields.io/conan/v/whisper-cpp)](https://conan.io/center/whisper-cpp)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Stable: [v1.7.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.2) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
Stable: [v1.7.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -16,7 +16,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision
- [Integer quantization support](#quantization)
- [4-bit and 5-bit integer quantization support](#quantization)
- Zero memory allocations at runtime
- [Vulkan support](#vulkan-gpu-support)
- Support for CPU-only inference
@ -89,11 +89,10 @@ Now build the [main](examples/main) example and transcribe an audio file like th
```bash
# build the main example
cmake -B build
cmake --build build --config Release
make -j
# transcribe an audio file
./build/bin/main -f samples/jfk.wav
./main -f samples/jfk.wav
```
---
@ -266,12 +265,11 @@ Here are the steps for creating and using a quantized model:
```bash
# quantize a model with Q5_0 method
cmake -B build
cmake --build build --config Release
./build/bin/quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
make -j quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
# run the examples as usual, specifying the quantized model file
./build/bin/main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
```
## Core ML support
@ -305,6 +303,10 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
- Build `whisper.cpp` with Core ML support:
```bash
# using Makefile
make clean
WHISPER_COREML=1 make -j
# using CMake
cmake -B build -DWHISPER_COREML=1
cmake --build build -j --config Release
@ -424,8 +426,8 @@ First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-do
Now build `whisper.cpp` with CUDA support:
```
cmake -B build -DGGML_CUDA=1
cmake --build build -j --config Release
make clean
GGML_CUDA=1 make -j
```
## Vulkan GPU support
@ -434,8 +436,8 @@ First, make sure your graphics card driver provides support for Vulkan API.
Now build `whisper.cpp` with Vulkan support:
```
cmake -B build -DGGML_VULKAN=1
cmake --build build -j --config Release
make clean
make GGML_VULKAN=1 -j
```
## BLAS CPU support via OpenBLAS
@ -446,13 +448,28 @@ First, make sure you have installed `openblas`: https://www.openblas.net/
Now build `whisper.cpp` with OpenBLAS support:
```
cmake -B build -DGGML_BLAS=1
cmake --build build -j --config Release
make clean
GGML_OPENBLAS=1 make -j
```
## BLAS CPU support via Intel MKL
Encoder processing can be accelerated on the CPU via the BLAS compatible interface of Intel's Math Kernel Library.
First, make sure you have installed Intel's MKL runtime and development packages: https://www.intel.com/content/www/us/en/developer/tools/oneapi/onemkl-download.html
Now build `whisper.cpp` with Intel MKL BLAS support:
```
source /opt/intel/oneapi/setvars.sh
mkdir build
cd build
cmake -DWHISPER_MKL=ON ..
WHISPER_MKL=1 make -j
```
## Ascend NPU support
Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores.
Ascend NPU provides inference acceleration via [`CANN`](https://www.hiascend.com/en/software/cann) and AI cores.
First, check if your Ascend NPU device is supported:
@ -466,8 +483,10 @@ Then, make sure you have installed [`CANN toolkit`](https://www.hiascend.com/en/
Now build `whisper.cpp` with CANN support:
```
cmake -B build -DGGML_CANN=1
cmake --build build -j --config Release
mkdir build
cd build
cmake .. -D GGML_CANN=on
make -j
```
Run the inference examples as usual, for example:
@ -617,9 +636,8 @@ The [stream](examples/stream) tool samples the audio every half a second and run
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
```bash
cmake -B build
cmake --build build --config Release
./build/bin/stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
make stream -j
./stream -m ./models/ggml-base.en.bin -t 8 --step 500 --length 5000
```
https://user-images.githubusercontent.com/1991296/194935793-76afede7-cfa8-48d8-a80f-28ba83be7d09.mp4

View File

@ -1,5 +0,0 @@
module whisper [system] {
header "whisper.h"
link "whisper"
export *
}

View File

@ -1,4 +0,0 @@
#pragma once
#include <whisper.h>

View File

@ -1,6 +1,6 @@
{
"name": "whisper.cpp",
"version": "1.7.2",
"version": "1.7.1",
"description": "Whisper speech recognition",
"main": "whisper.js",
"scripts": {

View File

@ -31,7 +31,7 @@ params.duration = 60_000
params.max_text_tokens = 300
params.translate = true
params.print_timestamps = false
params.initial_prompt = "Initial prompt here."
params.prompt = "Initial prompt here."
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
puts whole_text
@ -107,81 +107,5 @@ whisper.transcribe("path/to/audio.wav", params)
```
You can see model information:
```ruby
whisper = Whisper::Context.new("path/to/model.bin")
model = whisper.model
model.n_vocab # => 51864
model.n_audio_ctx # => 1500
model.n_audio_state # => 512
model.n_audio_head # => 8
model.n_audio_layer # => 6
model.n_text_ctx # => 448
model.n_text_state # => 512
model.n_text_head # => 8
model.n_text_layer # => 6
model.n_mels # => 80
model.ftype # => 1
model.type # => "base"
```
You can set a log callback:
```ruby
prefix = "[MyApp] "
log_callback = ->(level, buffer, user_data) {
case level
when Whisper::LOG_LEVEL_NONE
puts "#{user_data}none: #{buffer}"
when Whisper::LOG_LEVEL_INFO
puts "#{user_data}info: #{buffer}"
when Whisper::LOG_LEVEL_WARN
puts "#{user_data}warn: #{buffer}"
when Whisper::LOG_LEVEL_ERROR
puts "#{user_data}error: #{buffer}"
when Whisper::LOG_LEVEL_DEBUG
puts "#{user_data}debug: #{buffer}"
when Whisper::LOG_LEVEL_CONT
puts "#{user_data}same to previous: #{buffer}"
end
}
Whisper.log_set log_callback, prefix
```
Using this feature, you can also suppress logging:
```ruby
Whisper.log_set ->(level, buffer, user_data) {
# do nothing
}, nil
Whisper::Context.new(MODEL)
```
You can also call `Whisper::Context#full` and `#full_parallel` with a Ruby array as samples. Although `#transcribe` with audio file path is recommended because it extracts PCM samples in C++ and is fast, `#full` and `#full_parallel` give you flexibility.
```ruby
require "whisper"
require "wavefile"
reader = WaveFile::Reader.new("path/to/audio.wav", WaveFile::Format.new(:mono, :float, 16000))
samples = reader.enum_for(:each_buffer).map(&:samples).flatten
whisper = Whisper::Context.new("path/to/model.bin")
whisper.full(Whisper::Params.new, samples)
whisper.each_segment do |segment|
puts segment.text
end
```
The second argument `samples` may be an array, an object with `length` method, or a MemoryView. If you can prepare audio data as C array and export it as a MemoryView, whispercpp accepts and works with it with zero copy.
License
-------
The same as [whisper.cpp][].
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models

View File

@ -1,22 +1,20 @@
require 'rake/clean'
require "bundler/gem_tasks"
require "pathname"
require "yaml"
require "rake/testtask"
require_relative "extsources"
extsources = YAML.load_file("extsources.yaml")
SOURCES = FileList[]
EXTSOURCES.each do |src|
extsources.each do |src|
basename = src.pathmap("%f")
dest = basename == "LICENSE" ? basename : src.pathmap("%{../..,ext}p")
dir = dest.pathmap("%d")
dest = basename == "LICENSE" ? basename : basename.pathmap("ext/%f")
file src
directory dir
file dest => [src, dir] do |t|
file dest => src do |t|
cp t.source, t.name
end
SOURCES.include dest
end
CLEAN.include SOURCES
CLEAN.include FileList[
"ext/*.o",
@ -25,39 +23,30 @@ CLEAN.include FileList[
"ext/depend"
]
task build: FileList[
"ext/Makefile",
"ext/ruby_whisper.h",
"ext/ruby_whisper.cpp",
"whispercpp.gemspec",
]
task build: SOURCES + FileList[
"ext/extconf.rb",
"ext/ruby_whisper.h",
"ext/ruby_whisper.cpp",
"whispercpp.gemspec",
]
directory "pkg"
CLOBBER.include "pkg"
TEST_MODEL = "../../models/ggml-base.en.bin"
LIB_NAME = "whisper".ext(RbConfig::CONFIG["DLEXT"])
SO_FILE = File.join("ext", LIB_NAME)
LIB_FILE = File.join("lib", LIB_NAME)
file "ext/Makefile" => ["ext/extconf.rb", "ext/ruby_whisper.h", "ext/ruby_whisper.cpp"] + SOURCES do |t|
Dir.chdir "ext" do
ruby "extconf.rb"
end
end
file SO_FILE => "ext/Makefile" do |t|
directory "lib"
task LIB_FILE => SOURCES + ["lib"] do |t|
Dir.chdir "ext" do
sh "ruby extconf.rb"
sh "make"
end
mv "ext/#{LIB_NAME}", t.name
end
CLEAN.include LIB_FILE
directory "lib"
file LIB_FILE => [SO_FILE, "lib"] do |t|
copy t.source, t.name
end
Rake::TestTask.new do |t|
t.test_files = FileList["tests/test_*.rb"]
end
@ -68,13 +57,3 @@ file TEST_MODEL do
sh "./models/download-ggml-model.sh base.en"
end
end
TEST_MEMORY_VIEW = "tests/jfk_reader/jfk_reader.#{RbConfig::CONFIG['DLEXT']}"
file TEST_MEMORY_VIEW => "tests/jfk_reader/jfk_reader.c" do |t|
Dir.chdir "tests/jfk_reader" do
ruby "extconf.rb"
sh "make"
end
end
CLEAN.include "tests/jfk_reader/jfk_reader.{o,#{RbConfig::CONFIG['DLEXT']}}"
task test: TEST_MEMORY_VIEW

View File

@ -1,14 +1,35 @@
Makefile
whisper.so
whisper.bundle
whisper.dll
ggml.c
ggml.h
ggml-alloc.c
ggml-alloc.h
ggml-aarch64.c
ggml-aarch64.h
ggml-backend.cpp
ggml-backend-impl.h
ggml-backend.c
ggml-backend.h
ggml-common.h
ggml-cpu-impl.h
ggml-metal.m
ggml-metal.metal
ggml-metal-embed.metal
ggml-blas.cpp
ggml-cuda.h
ggml-impl.h
ggml-kompute.h
ggml-metal.h
ggml-opencl.h
ggml-quants.c
ggml-quants.h
ggml-sycl.h
ggml-vulkan.h
ggml-blas.h
get-flags.mk
whisper.cpp
whisper.h
dr_wav.h
depend
scripts/get-flags.mk
*.o
*.c
*.cpp
*.h
*.m
*.metal
!ruby_whisper.cpp
!ruby_whisper.h
whisper.bundle
whisper.so
whisper.dll

View File

@ -1,9 +0,0 @@
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
ggml/src/ggml-cpu/ggml-cpu.cpp \
ggml/include/ggml-backend.h \
ggml/include/ggml.h \
ggml/include/ggml-alloc.h \
ggml/src/ggml-backend-impl.h \
ggml/include/ggml-cpu.h \
ggml/src/ggml-impl.h
$(CXX) $(CXXFLAGS) -c $< -o $@

View File

@ -2,9 +2,6 @@ require 'mkmf'
# need to use c++ compiler flags
$CXXFLAGS << ' -std=c++11'
$LDFLAGS << ' -lstdc++'
# Set to true when building binary gems
if enable_config('static-stdlib', false)
$LDFLAGS << ' -static-libgcc -static-libstdc++'
@ -15,6 +12,34 @@ if enable_config('march-tune-native', false)
$CXXFLAGS << ' -march=native -mtune=native'
end
# Temporarily hides source files for backends that are not enabled while the
# given block runs, then puts them back.
#
# A file is hidden by renaming it to "<name>.disabled" in the current directory:
# * ggml-metal.h / ggml-metal.m  unless $GGML_METAL is set
# * ggml-metal.metal             unless $GGML_METAL_EMBED_LIBRARY is set
# * ggml-blas.h / ggml-blas.cpp  unless $OBJ_ALL mentions 'ggml-blas.o'
# Files that do not exist are ignored.
#
# The original names are restored even when the block raises, so a failure
# inside the block does not leave the tree with "*.disabled" files.
#
# Yields nothing. Returns the list of files that were selected for hiding.
def with_disabling_unsupported_files
  disabled_files = []

  unless $GGML_METAL
    disabled_files << 'ggml-metal.h' << 'ggml-metal.m'
  end

  unless $GGML_METAL_EMBED_LIBRARY
    disabled_files << 'ggml-metal.metal'
  end

  unless $OBJ_ALL&.include? 'ggml-blas.o'
    disabled_files << 'ggml-blas.h' << 'ggml-blas.cpp'
  end

  disabled_files.filter! {|file| File.exist? file}

  # Track what was actually renamed so that the cleanup below never tries to
  # restore a file whose rename did not happen.
  renamed_files = []
  begin
    disabled_files.each do |file|
      File.rename file, "#{file}.disabled"
      renamed_files << file
    end

    yield
  ensure
    renamed_files.each do |file|
      File.rename "#{file}.disabled", file
    end
  end

  disabled_files
end
if ENV['WHISPER_METAL']
$GGML_METAL ||= true
$DEPRECATE_WARNING ||= true
@ -35,16 +60,16 @@ if $GGML_METAL
$GGML_METAL_EMBED_LIBRARY = true
end
$MK_CPPFLAGS = '-Iggml/include -Iggml/src -Iinclude -Isrc -Iexamples'
$MK_CPPFLAGS = ''
$MK_CFLAGS = '-std=c11 -fPIC'
$MK_CXXFLAGS = '-std=c++11 -fPIC'
$MK_NVCCFLAGS = '-std=c++11'
$MK_LDFLAGS = ''
$OBJ_GGML = []
$OBJ_WHISPER = []
$OBJ_COMMON = []
$OBJ_SDL = []
$OBJ_GGML = ''
$OBJ_WHISPER = ''
$OBJ_COMMON = ''
$OBJ_SDL = ''
$MK_CPPFLAGS << ' -D_XOPEN_SOURCE=600'
@ -123,11 +148,11 @@ end
unless ENV['GGML_NO_ACCELERATE']
if $UNAME_S == 'Darwin'
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS -DGGML_BLAS_USE_ACCELERATE'
$MK_CPPFLAGS << ' -DGGML_USE_ACCELERATE -DGGML_USE_BLAS'
$MK_CPPFLAGS << ' -DACCELERATE_NEW_LAPACK'
$MK_CPPFLAGS << ' -DACCELERATE_LAPACK_ILP64'
$MK_LDFLAGS << ' -framework Accelerate'
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
$OBJ_GGML << ' ggml-blas.o'
end
end
@ -135,20 +160,20 @@ if ENV['GGML_OPENBLAS']
# Ask pkg-config for the OpenBLAS include, compile and link flags.
# Each output is chomped so its trailing newline does not end up inside the flags.
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas`.chomp}"
# No ")" after the package name: an unbalanced ")" is a shell syntax error,
# which made this command produce no output.
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas`.chomp}"
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
$OBJ_GGML << ' ggml-blas.o'
end
if ENV['GGML_OPENBLAS64']
# Ask pkg-config for the OpenBLAS (64-bit interface) include, compile and link flags.
# Each output is chomped so its trailing newline does not end up inside the flags.
$MK_CPPFLAGS << " -DGGML_USE_BLAS #{`pkg-config --cflags-only-I openblas64`.chomp}"
# No ")" after the package name: an unbalanced ")" is a shell syntax error,
# which made this command produce no output.
$MK_CFLAGS << " #{`pkg-config --cflags-only-other openblas64`.chomp}"
$MK_LDFLAGS << " #{`pkg-config --libs openblas64`.chomp}"
$OBJ_GGML << 'ggml/src/ggml-blas/ggml-blas.o'
$OBJ_GGML << ' ggml-blas.o'
end
if $GGML_METAL
$MK_CPPFLAGS << ' -DGGML_USE_METAL'
$MK_LDFLAGS << ' -framework Foundation -framework Metal -framework MetalKit'
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal.o'
$OBJ_GGML << ' ggml-metal.o'
if ENV['GGML_METAL_NDEBUG']
$MK_CPPFLAGS << ' -DGGML_METAL_NDEBUG'
@ -156,29 +181,21 @@ if $GGML_METAL
if $GGML_METAL_EMBED_LIBRARY
$MK_CPPFLAGS << ' -DGGML_METAL_EMBED_LIBRARY'
$OBJ_GGML << 'ggml/src/ggml-metal/ggml-metal-embed.o'
$OBJ_GGML << ' ggml-metal-embed.o'
end
end
$OBJ_GGML <<
'ggml/src/ggml.o' <<
'ggml/src/ggml-aarch64.o' <<
'ggml/src/ggml-alloc.o' <<
'ggml/src/ggml-backend.o' <<
'ggml/src/ggml-backend-reg.o' <<
'ggml/src/ggml-opt.o' <<
'ggml/src/ggml-quants.o' <<
'ggml/src/ggml-threading.o' <<
'ggml/src/ggml-cpu/ggml-cpu.o' <<
'ggml/src/ggml-cpu/ggml-cpu-cpp.o' <<
'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
'ggml/src/ggml-cpu/ggml-cpu-quants.o'
' ggml.o' <<
' ggml-alloc.o' <<
' ggml-backend.o' <<
' ggml-quants.o' <<
' ggml-aarch64.o'
$OBJ_WHISPER <<
'src/whisper.o'
' whisper.o'
$objs = $OBJ_GGML + $OBJ_WHISPER + $OBJ_COMMON + $OBJ_SDL
$objs << "ruby_whisper.o"
$OBJ_ALL = "#{$OBJ_GGML} #{$OBJ_WHISPER} #{$OBJ_COMMON} #{$OBJ_SDL}"
$CPPFLAGS = "#{$MK_CPPFLAGS} #{$CPPFLAGS}"
$CFLAGS = "#{$CPPFLAGS} #{$MK_CFLAGS} #{$GF_CFLAGS} #{$CFLAGS}"
@ -187,16 +204,26 @@ $CXXFLAGS = "#{$BASE_CXXFLAGS} #{$HOST_CXXFLAGS} #{$GF_CXXFLAGS} #{$CPPFLAGS}"
$NVCCFLAGS = "#{$MK_NVCCFLAGS} #{$NVCCFLAGS}"
$LDFLAGS = "#{$MK_LDFLAGS} #{$LDFLAGS}"
create_makefile('whisper')
if $GGML_METAL_EMBED_LIBRARY
File.write 'depend', "$(OBJS): $(OBJS) ggml-metal-embed.o\n"
end
with_disabling_unsupported_files do
create_makefile('whisper')
end
File.open 'Makefile', 'a' do |file|
file.puts 'include scripts/get-flags.mk'
file.puts 'include cpu.mk'
file.puts 'include get-flags.mk'
if $GGML_METAL
file.puts 'include metal.mk'
if $GGML_METAL_EMBED_LIBRARY
# mkmf decides which object files to compile based on the *.{c,cpp,m} files that already exist,
# but ggml-metal-embed.c does not exist yet when the Makefile is created.
file.puts "objs := $(OBJS)"
file.puts "OBJS = $(objs) 'ggml-metal-embed.o'"
file.puts 'include metal-embed.mk'
end
end

View File

@ -1,17 +1,14 @@
ggml/src/ggml-metal/ggml-metal-embed.o: \
ggml/src/ggml-metal/ggml-metal.metal \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/src/ggml-common.h
ggml-metal-embed.o: \
ggml-metal.metal \
ggml-common.h
@echo "Embedding Metal library"
@sed -e '/__embed_ggml-common.h__/r ggml/src/ggml-common.h' -e '/__embed_ggml-common.h__/d' < ggml/src/ggml-metal/ggml-metal.metal > ggml/src/ggml-metal/ggml-metal-embed.metal.tmp
@sed -e '/#include "ggml-metal-impl.h"/r ggml/src/ggml-metal/ggml-metal-impl.h' -e '/#include "ggml-metal-impl.h"/d' < ggml/src/ggml-metal/ggml-metal-embed.metal.tmp > ggml/src/ggml-metal/ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".incbin \"ggml/src/ggml-metal/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
@rmdir ${TEMP_ASSEMBLY}
@sed -e '/#include "ggml-common.h"/r ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml-metal.metal > ggml-metal-embed.metal
$(eval TEMP_ASSEMBLY=$(shell mktemp))
@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
@echo ".incbin \"ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
@$(AS) $(TEMP_ASSEMBLY) -o $@
@rm -f ${TEMP_ASSEMBLY}

View File

@ -1,6 +0,0 @@
# Build the Metal backend object. Only the first prerequisite ($<, i.e.
# ggml-metal.m) is compiled; the headers are listed so that editing any of
# them triggers a rebuild of the object.
ggml/src/ggml-metal/ggml-metal.o: \
ggml/src/ggml-metal/ggml-metal.m \
ggml/src/ggml-metal/ggml-metal-impl.h \
ggml/include/ggml-metal.h \
ggml/include/ggml.h
$(CC) $(CFLAGS) -c $< -o $@

View File

@ -1,5 +1,4 @@
#include <ruby.h>
#include <ruby/memory_view.h>
#include "ruby_whisper.h"
#define DR_WAV_IMPLEMENTATION
#include "dr_wav.h"
@ -36,17 +35,11 @@ extern "C" {
VALUE mWhisper;
VALUE cContext;
VALUE cParams;
VALUE eError;
static ID id_to_s;
static ID id_call;
static ID id___method__;
static ID id_to_enum;
static ID id_length;
static ID id_next;
static ID id_new;
static bool is_log_callback_finalized = false;
/*
* call-seq:
@ -95,39 +88,6 @@ static VALUE ruby_whisper_s_lang_str_full(VALUE self, VALUE id) {
return rb_str_new2(str_full);
}
/*
* Finalizer target registered on the log callback by ruby_whisper_s_log_set.
* It only raises the is_log_callback_finalized flag, which the native log
* hook checks before calling back into Ruby. The +id+ argument is unused.
*/
static VALUE ruby_whisper_s_finalize_log_callback(VALUE self, VALUE id) {
is_log_callback_finalized = true;
return Qnil;
}
/*
* call-seq:
* log_set ->(level, buffer, user_data) { ... }, user_data -> nil
*
* Stores +log_callback+ and +user_data+ as instance variables of the receiver
* and installs a native whisper log hook that forwards every log line to the
* stored callback as (level, buffer, user_data).
*/
static VALUE ruby_whisper_s_log_set(VALUE self, VALUE log_callback, VALUE user_data) {
// Drop the finalizer of a previously installed callback so that collecting the
// old callable does not flip is_log_callback_finalized for the new one.
VALUE old_callback = rb_iv_get(self, "log_callback");
if (!NIL_P(old_callback)) {
rb_undefine_finalizer(old_callback);
}
rb_iv_set(self, "log_callback", log_callback);
rb_iv_set(self, "user_data", user_data);
// Whisper.method("finalize_log_callback") is used as the finalizer proc.
VALUE finalize_log_callback = rb_funcall(mWhisper, rb_intern("method"), 1, rb_str_new2("finalize_log_callback"));
rb_define_finalizer(log_callback, finalize_log_callback);
// The captureless lambda decays to the C callback type; the callable and the
// user data are looked up on mWhisper at call time rather than passed through
// the native user_data pointer (which is nullptr).
whisper_log_set([](ggml_log_level level, const char * buffer, void * user_data) {
// Once the callback has been finalized it must not be invoked any more.
if (is_log_callback_finalized) {
return;
}
VALUE log_callback = rb_iv_get(mWhisper, "log_callback");
VALUE udata = rb_iv_get(mWhisper, "user_data");
// NOTE(review): rb_funcall requires the calling thread to hold the GVL;
// confirm whisper never logs from its own worker threads.
rb_funcall(log_callback, id_call, 3, INT2NUM(level), rb_str_new2(buffer), udata);
}, nullptr);
return Qnil;
}
static void ruby_whisper_free(ruby_whisper *rw) {
if (rw->context) {
whisper_free(rw->context);
@ -429,288 +389,6 @@ static VALUE ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
return self;
}
/*
 * call-seq:
 *   model_n_vocab -> Integer
 *
 * Returns whisper_model_n_vocab() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_vocab(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_vocab = whisper_model_n_vocab(wrapper->context);
  return INT2NUM(n_vocab);
}
/*
 * call-seq:
 *   model_n_audio_ctx -> Integer
 *
 * Returns whisper_model_n_audio_ctx() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_audio_ctx(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_audio_ctx = whisper_model_n_audio_ctx(wrapper->context);
  return INT2NUM(n_audio_ctx);
}
/*
 * call-seq:
 *   model_n_audio_state -> Integer
 *
 * Returns whisper_model_n_audio_state() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_audio_state(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_audio_state = whisper_model_n_audio_state(wrapper->context);
  return INT2NUM(n_audio_state);
}
/*
 * call-seq:
 *   model_n_audio_head -> Integer
 *
 * Returns whisper_model_n_audio_head() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_audio_head(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_audio_head = whisper_model_n_audio_head(wrapper->context);
  return INT2NUM(n_audio_head);
}
/*
 * call-seq:
 *   model_n_audio_layer -> Integer
 *
 * Returns whisper_model_n_audio_layer() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_audio_layer(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_audio_layer = whisper_model_n_audio_layer(wrapper->context);
  return INT2NUM(n_audio_layer);
}
/*
 * call-seq:
 *   model_n_text_ctx -> Integer
 *
 * Returns whisper_model_n_text_ctx() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_text_ctx(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_text_ctx = whisper_model_n_text_ctx(wrapper->context);
  return INT2NUM(n_text_ctx);
}
/*
 * call-seq:
 *   model_n_text_state -> Integer
 *
 * Returns whisper_model_n_text_state() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_text_state(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_text_state = whisper_model_n_text_state(wrapper->context);
  return INT2NUM(n_text_state);
}
/*
 * call-seq:
 *   model_n_text_head -> Integer
 *
 * Returns whisper_model_n_text_head() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_text_head(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_text_head = whisper_model_n_text_head(wrapper->context);
  return INT2NUM(n_text_head);
}
/*
 * call-seq:
 *   model_n_text_layer -> Integer
 *
 * Returns whisper_model_n_text_layer() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_text_layer(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_text_layer = whisper_model_n_text_layer(wrapper->context);
  return INT2NUM(n_text_layer);
}
/*
 * call-seq:
 *   model_n_mels -> Integer
 *
 * Returns whisper_model_n_mels() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_n_mels(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int n_mels = whisper_model_n_mels(wrapper->context);
  return INT2NUM(n_mels);
}
/*
 * call-seq:
 *   model_ftype -> Integer
 *
 * Returns whisper_model_ftype() for the wrapped whisper context.
 */
VALUE ruby_whisper_model_ftype(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const int ftype = whisper_model_ftype(wrapper->context);
  return INT2NUM(ftype);
}
/*
 * call-seq:
 *   model_type -> String
 *
 * Returns whisper_model_type_readable() for the wrapped whisper context as a
 * new Ruby String.
 */
VALUE ruby_whisper_model_type(VALUE self) {
  ruby_whisper *wrapper = NULL;
  Data_Get_Struct(self, ruby_whisper, wrapper);
  const char *readable_type = whisper_model_type_readable(wrapper->context);
  return rb_str_new2(readable_type);
}
/*
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
* Not thread safe for same context
* Uses the specified decoding strategy to obtain the text.
*
* call-seq:
* full(params, samples, n_samples) -> nil
* full(params, samples) -> nil
*
* The second argument +samples+ must be an array of samples, respond to :length, or be a MemoryView of an array of float. It must be 32 bit float PCM audio data.
*/
VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self) {
if (argc < 2 || argc > 3) {
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
}
ruby_whisper *rw;
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper, rw);
VALUE params = argv[0];
Data_Get_Struct(params, ruby_whisper_params, rwp);
VALUE samples = argv[1];
int n_samples;
rb_memory_view_t view;
const bool memory_view_available_p = rb_memory_view_available_p(samples);
if (argc == 3) {
n_samples = NUM2INT(argv[2]);
if (TYPE(samples) == T_ARRAY) {
if (RARRAY_LEN(samples) < n_samples) {
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
}
}
// Should check when samples.respond_to?(:length)?
} else {
if (TYPE(samples) == T_ARRAY) {
n_samples = RARRAY_LEN(samples);
} else if (memory_view_available_p) {
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
view.obj = Qnil;
rb_raise(rb_eArgError, "unable to get a memory view");
}
n_samples = view.byte_size / view.item_size;
} else if (rb_respond_to(samples, id_length)) {
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
} else {
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
}
}
float * c_samples = (float *)malloc(n_samples * sizeof(float));
if (memory_view_available_p) {
c_samples = (float *)view.data;
} else {
if (TYPE(samples) == T_ARRAY) {
for (int i = 0; i < n_samples; i++) {
c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
}
} else {
// TODO: use rb_block_call
VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
for (int i = 0; i < n_samples; i++) {
// TODO: check if iter is exhausted and raise ArgumentError appropriately
VALUE sample = rb_funcall(iter, id_next, 0);
c_samples[i] = RFLOAT_VALUE(sample);
}
}
}
const int result = whisper_full(rw->context, rwp->params, c_samples, n_samples);
if (0 == result) {
return Qnil;
} else {
rb_exc_raise(rb_funcall(eError, id_new, 1, result));
}
}
/*
* Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
* Result is stored in the default state of the context
* Not thread safe if executed in parallel on the same context.
* It seems this approach can offer some speedup in some cases.
* However, the transcription accuracy can be worse at the beginning and end of each chunk.
*
* call-seq:
* full_parallel(params, samples) -> nil
* full_parallel(params, samples, n_samples) -> nil
* full_parallel(params, samples, n_samples, n_processors) -> nil
* full_parallel(params, samples, nil, n_processors) -> nil
*/
static VALUE ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self) {
if (argc < 2 || argc > 4) {
rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2..3)", argc);
}
ruby_whisper *rw;
ruby_whisper_params *rwp;
Data_Get_Struct(self, ruby_whisper, rw);
VALUE params = argv[0];
Data_Get_Struct(params, ruby_whisper_params, rwp);
VALUE samples = argv[1];
int n_samples;
int n_processors;
rb_memory_view_t view;
const bool memory_view_available_p = rb_memory_view_available_p(samples);
switch (argc) {
case 2:
n_processors = 1;
break;
case 3:
n_processors = 1;
break;
case 4:
n_processors = NUM2INT(argv[3]);
break;
}
if (argc >= 3 && !NIL_P(argv[2])) {
n_samples = NUM2INT(argv[2]);
if (TYPE(samples) == T_ARRAY) {
if (RARRAY_LEN(samples) < n_samples) {
rb_raise(rb_eArgError, "samples length %ld is less than n_samples %d", RARRAY_LEN(samples), n_samples);
}
}
// Should check when samples.respond_to?(:length)?
} else if (memory_view_available_p) {
if (!rb_memory_view_get(samples, &view, RUBY_MEMORY_VIEW_SIMPLE)) {
view.obj = Qnil;
rb_raise(rb_eArgError, "unable to get a memory view");
}
n_samples = view.byte_size / view.item_size;
} else {
if (TYPE(samples) == T_ARRAY) {
n_samples = RARRAY_LEN(samples);
} else if (rb_respond_to(samples, id_length)) {
n_samples = NUM2INT(rb_funcall(samples, id_length, 0));
} else {
rb_raise(rb_eArgError, "samples must respond to :length or be a MemoryView of an array of flaot when n_samples is not given");
}
}
float * c_samples = (float *)malloc(n_samples * sizeof(float));
if (memory_view_available_p) {
c_samples = (float *)view.data;
} else {
if (TYPE(samples) == T_ARRAY) {
for (int i = 0; i < n_samples; i++) {
c_samples[i] = RFLOAT_VALUE(rb_ary_entry(samples, i));
}
} else {
// FIXME: use rb_block_call
VALUE iter = rb_funcall(samples, id_to_enum, 1, rb_str_new2("each"));
for (int i = 0; i < n_samples; i++) {
// TODO: check if iter is exhausted and raise ArgumentError
VALUE sample = rb_funcall(iter, id_next, 0);
c_samples[i] = RFLOAT_VALUE(sample);
}
}
}
const int result = whisper_full_parallel(rw->context, rwp->params, c_samples, n_samples, n_processors);
if (0 == result) {
return Qnil;
} else {
rb_exc_raise(rb_funcall(eError, id_new, 1, result));
}
}
/*
* Number of segments.
*
@ -1337,12 +1015,7 @@ typedef struct {
int index;
} ruby_whisper_segment;
/*
* Payload wrapped by Whisper::Model objects. It holds only the Ruby object of
* the owning Whisper::Context, which the mark function keeps alive for the GC.
*/
typedef struct {
VALUE context;
} ruby_whisper_model;
VALUE cSegment;
VALUE cModel;
static void rb_whisper_segment_mark(ruby_whisper_segment *rws) {
rb_gc_mark(rws->context);
@ -1515,268 +1188,31 @@ static VALUE ruby_whisper_segment_get_text(VALUE self) {
return rb_str_new2(text);
}
/* GC mark callback for Whisper::Model: marks the referenced context VALUE. */
static void rb_whisper_model_mark(ruby_whisper_model *rwm) {
rb_gc_mark(rwm->context);
}
/*
 * Allocation function for Whisper::Model.
 *
 * The payload's context slot is initialised to nil right away. ALLOC returns
 * uninitialised memory, and rb_whisper_model_mark may run on the object
 * before anybody assigns a context (for instance after Whisper::Model.allocate
 * from Ruby); marking a garbage VALUE would crash the GC.
 */
static VALUE ruby_whisper_model_allocate(VALUE klass) {
  ruby_whisper_model *rwm;
  rwm = ALLOC(ruby_whisper_model);
  rwm->context = Qnil;
  return Data_Wrap_Struct(klass, rb_whisper_model_mark, RUBY_DEFAULT_FREE, rwm);
}
/*
 * Builds a new Whisper::Model object whose payload references +context+.
 */
static VALUE rb_whisper_model_initialize(VALUE context) {
  const VALUE wrapped = ruby_whisper_model_allocate(cModel);
  ruby_whisper_model *payload = NULL;
  Data_Get_Struct(wrapped, ruby_whisper_model, payload);
  payload->context = context;
  return wrapped;
}
/*
 * call-seq:
 *   model -> Whisper::Model
 *
 * Returns a new Whisper::Model that refers back to this context.
 */
static VALUE ruby_whisper_get_model(VALUE self) {
  const VALUE model = rb_whisper_model_initialize(self);
  return model;
}
/*
 * call-seq:
 *   n_vocab -> Integer
 *
 * Returns whisper_model_n_vocab() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_vocab(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_vocab = whisper_model_n_vocab(owner->context);
  return INT2NUM(n_vocab);
}
/*
 * call-seq:
 *   n_audio_ctx -> Integer
 *
 * Returns whisper_model_n_audio_ctx() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_audio_ctx(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_audio_ctx = whisper_model_n_audio_ctx(owner->context);
  return INT2NUM(n_audio_ctx);
}
/*
 * call-seq:
 *   n_audio_state -> Integer
 *
 * Returns whisper_model_n_audio_state() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_audio_state(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_audio_state = whisper_model_n_audio_state(owner->context);
  return INT2NUM(n_audio_state);
}
/*
 * call-seq:
 *   n_audio_head -> Integer
 *
 * Returns whisper_model_n_audio_head() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_audio_head(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_audio_head = whisper_model_n_audio_head(owner->context);
  return INT2NUM(n_audio_head);
}
/*
 * call-seq:
 *   n_audio_layer -> Integer
 *
 * Returns whisper_model_n_audio_layer() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_audio_layer(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_audio_layer = whisper_model_n_audio_layer(owner->context);
  return INT2NUM(n_audio_layer);
}
/*
 * call-seq:
 *   n_text_ctx -> Integer
 *
 * Returns whisper_model_n_text_ctx() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_text_ctx(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_text_ctx = whisper_model_n_text_ctx(owner->context);
  return INT2NUM(n_text_ctx);
}
/*
 * call-seq:
 *   n_text_state -> Integer
 *
 * Returns whisper_model_n_text_state() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_text_state(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_text_state = whisper_model_n_text_state(owner->context);
  return INT2NUM(n_text_state);
}
/*
 * call-seq:
 *   n_text_head -> Integer
 *
 * Returns whisper_model_n_text_head() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_text_head(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_text_head = whisper_model_n_text_head(owner->context);
  return INT2NUM(n_text_head);
}
/*
 * call-seq:
 *   n_text_layer -> Integer
 *
 * Returns whisper_model_n_text_layer() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_text_layer(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_text_layer = whisper_model_n_text_layer(owner->context);
  return INT2NUM(n_text_layer);
}
/*
 * call-seq:
 *   n_mels -> Integer
 *
 * Returns whisper_model_n_mels() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_n_mels(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int n_mels = whisper_model_n_mels(owner->context);
  return INT2NUM(n_mels);
}
/*
 * call-seq:
 *   ftype -> Integer
 *
 * Returns whisper_model_ftype() for the context this model refers to.
 */
static VALUE ruby_whisper_c_model_ftype(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const int ftype = whisper_model_ftype(owner->context);
  return INT2NUM(ftype);
}
/*
 * call-seq:
 *   type -> String
 *
 * Returns whisper_model_type_readable() for the context this model refers to
 * as a new Ruby String.
 */
static VALUE ruby_whisper_c_model_type(VALUE self) {
  ruby_whisper_model *model = NULL;
  ruby_whisper *owner = NULL;
  Data_Get_Struct(self, ruby_whisper_model, model);
  Data_Get_Struct(model->context, ruby_whisper, owner);
  const char *readable_type = whisper_model_type_readable(owner->context);
  return rb_str_new2(readable_type);
}
/*
 * call-seq:
 *   new(code) -> Whisper::Error
 *
 * Initializes the exception from a whisper error +code+: the code is mapped
 * to a human readable message that is passed to the superclass initializer,
 * and the original code is stored in @code.
 * Raises TypeError (from NUM2INT) when +code+ is not an Integer.
 */
static VALUE ruby_whisper_error_initialize(VALUE self, VALUE code) {
  const int c_code = NUM2INT(code);
  // Must be pointer-to-const: string literals are const in C++.
  const char *raw_message;
  switch (c_code) {
  case -2:
    raw_message = "failed to compute log mel spectrogram";
    break;
  case -3:
    raw_message = "failed to auto-detect language";
    break;
  case -4:
    raw_message = "too many decoders requested";
    break;
  case -5:
    raw_message = "audio_ctx is larger than the maximum allowed";
    break;
  case -6:
    raw_message = "failed to encode";
    break;
  case -7:
    raw_message = "whisper_kv_cache_init() failed for self-attention cache";
    break;
  case -8: // both codes carry the same message
  case -9:
    raw_message = "failed to decode";
    break;
  default:
    raw_message = "unknown error";
    break;
  }
  const VALUE message = rb_str_new2(raw_message);
  rb_call_super(1, &message);
  rb_iv_set(self, "@code", code);

  return self;
}
void Init_whisper() {
id_to_s = rb_intern("to_s");
id_call = rb_intern("call");
id___method__ = rb_intern("__method__");
id_to_enum = rb_intern("to_enum");
id_length = rb_intern("length");
id_next = rb_intern("next");
id_new = rb_intern("new");
mWhisper = rb_define_module("Whisper");
cContext = rb_define_class_under(mWhisper, "Context", rb_cObject);
cParams = rb_define_class_under(mWhisper, "Params", rb_cObject);
eError = rb_define_class_under(mWhisper, "Error", rb_eStandardError);
rb_define_const(mWhisper, "LOG_LEVEL_NONE", INT2NUM(GGML_LOG_LEVEL_NONE));
rb_define_const(mWhisper, "LOG_LEVEL_INFO", INT2NUM(GGML_LOG_LEVEL_INFO));
rb_define_const(mWhisper, "LOG_LEVEL_WARN", INT2NUM(GGML_LOG_LEVEL_WARN));
rb_define_const(mWhisper, "LOG_LEVEL_ERROR", INT2NUM(GGML_LOG_LEVEL_ERROR));
rb_define_const(mWhisper, "LOG_LEVEL_DEBUG", INT2NUM(GGML_LOG_LEVEL_DEBUG));
rb_define_const(mWhisper, "LOG_LEVEL_CONT", INT2NUM(GGML_LOG_LEVEL_CONT));
rb_define_singleton_method(mWhisper, "lang_max_id", ruby_whisper_s_lang_max_id, 0);
rb_define_singleton_method(mWhisper, "lang_id", ruby_whisper_s_lang_id, 1);
rb_define_singleton_method(mWhisper, "lang_str", ruby_whisper_s_lang_str, 1);
rb_define_singleton_method(mWhisper, "lang_str_full", ruby_whisper_s_lang_str_full, 1);
rb_define_singleton_method(mWhisper, "log_set", ruby_whisper_s_log_set, 2);
rb_define_singleton_method(mWhisper, "finalize_log_callback", ruby_whisper_s_finalize_log_callback, 1);
rb_define_alloc_func(cContext, ruby_whisper_allocate);
rb_define_method(cContext, "initialize", ruby_whisper_initialize, -1);
rb_define_method(cContext, "transcribe", ruby_whisper_transcribe, -1);
rb_define_method(cContext, "model_n_vocab", ruby_whisper_model_n_vocab, 0);
rb_define_method(cContext, "model_n_audio_ctx", ruby_whisper_model_n_audio_ctx, 0);
rb_define_method(cContext, "model_n_audio_state", ruby_whisper_model_n_audio_state, 0);
rb_define_method(cContext, "model_n_audio_head", ruby_whisper_model_n_audio_head, 0);
rb_define_method(cContext, "model_n_audio_layer", ruby_whisper_model_n_audio_layer, 0);
rb_define_method(cContext, "model_n_text_ctx", ruby_whisper_model_n_text_ctx, 0);
rb_define_method(cContext, "model_n_text_state", ruby_whisper_model_n_text_state, 0);
rb_define_method(cContext, "model_n_text_head", ruby_whisper_model_n_text_head, 0);
rb_define_method(cContext, "model_n_text_layer", ruby_whisper_model_n_text_layer, 0);
rb_define_method(cContext, "model_n_mels", ruby_whisper_model_n_mels, 0);
rb_define_method(cContext, "model_ftype", ruby_whisper_model_ftype, 0);
rb_define_method(cContext, "model_type", ruby_whisper_model_type, 0);
rb_define_method(cContext, "full_n_segments", ruby_whisper_full_n_segments, 0);
rb_define_method(cContext, "full_lang_id", ruby_whisper_full_lang_id, 0);
rb_define_method(cContext, "full_get_segment_t0", ruby_whisper_full_get_segment_t0, 1);
rb_define_method(cContext, "full_get_segment_t1", ruby_whisper_full_get_segment_t1, 1);
rb_define_method(cContext, "full_get_segment_speaker_turn_next", ruby_whisper_full_get_segment_speaker_turn_next, 1);
rb_define_method(cContext, "full_get_segment_text", ruby_whisper_full_get_segment_text, 1);
rb_define_method(cContext, "full", ruby_whisper_full, -1);
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
rb_define_alloc_func(cParams, ruby_whisper_params_allocate);
@ -1836,9 +1272,6 @@ void Init_whisper() {
rb_define_method(cParams, "abort_callback=", ruby_whisper_params_set_abort_callback, 1);
rb_define_method(cParams, "abort_callback_user_data=", ruby_whisper_params_set_abort_callback_user_data, 1);
rb_define_attr(eError, "code", true, false);
rb_define_method(eError, "initialize", ruby_whisper_error_initialize, 1);
// High level
cSegment = rb_define_class_under(mWhisper, "Segment", rb_cObject);
@ -1851,22 +1284,6 @@ void Init_whisper() {
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
cModel = rb_define_class_under(mWhisper, "Model", rb_cObject);
rb_define_alloc_func(cModel, ruby_whisper_model_allocate);
rb_define_method(cContext, "model", ruby_whisper_get_model, 0);
rb_define_method(cModel, "n_vocab", ruby_whisper_c_model_n_vocab, 0);
rb_define_method(cModel, "n_audio_ctx", ruby_whisper_c_model_n_audio_ctx, 0);
rb_define_method(cModel, "n_audio_state", ruby_whisper_c_model_n_audio_state, 0);
rb_define_method(cModel, "n_audio_head", ruby_whisper_c_model_n_audio_head, 0);
rb_define_method(cModel, "n_audio_layer", ruby_whisper_c_model_n_audio_layer, 0);
rb_define_method(cModel, "n_text_ctx", ruby_whisper_c_model_n_text_ctx, 0);
rb_define_method(cModel, "n_text_state", ruby_whisper_c_model_n_text_state, 0);
rb_define_method(cModel, "n_text_head", ruby_whisper_c_model_n_text_head, 0);
rb_define_method(cModel, "n_text_layer", ruby_whisper_c_model_n_text_layer, 0);
rb_define_method(cModel, "n_mels", ruby_whisper_c_model_n_mels, 0);
rb_define_method(cModel, "ftype", ruby_whisper_c_model_ftype, 0);
rb_define_method(cModel, "type", ruby_whisper_c_model_type, 0);
}
#ifdef __cplusplus
}

View File

@ -1,6 +0,0 @@
require "yaml"
# Every file tracked by git under the repository root (NUL separated so that
# unusual file names survive).
sources = `git ls-files -z ../..`.split("\x0")
# Reuse the path filters of the Ruby bindings CI workflow as the list of
# sources to bundle. The workflow's top-level `on:` key is loaded by the YAML
# parser as the boolean true, hence the [true] lookup.
paths = YAML.load_file("../../.github/workflows/bindings-ruby.yml")[true]["push"]["paths"]
# The bindings directory itself is not an external source.
paths.delete "bindings/ruby/**"
# Expand the globs relative to the repository root, add the license, and keep
# only the entries that git actually tracks.
EXTSOURCES = (Dir.glob(paths, base: "../..").collect {|path| "../../#{path}"} << "../../LICENSE") & sources

View File

@ -0,0 +1,29 @@
---
- ../../src/whisper.cpp
- ../../include/whisper.h
- ../../ggml/src/ggml.c
- ../../ggml/src/ggml-impl.h
- ../../ggml/src/ggml-aarch64.h
- ../../ggml/src/ggml-aarch64.c
- ../../ggml/src/ggml-alloc.c
- ../../ggml/src/ggml-backend-impl.h
- ../../ggml/src/ggml-backend.cpp
- ../../ggml/src/ggml-common.h
- ../../ggml/src/ggml-quants.h
- ../../ggml/src/ggml-quants.c
- ../../ggml/src/ggml-cpu-impl.h
- ../../ggml/src/ggml-metal.m
- ../../ggml/src/ggml-metal.metal
- ../../ggml/src/ggml-blas.cpp
- ../../ggml/include/ggml.h
- ../../ggml/include/ggml-alloc.h
- ../../ggml/include/ggml-backend.h
- ../../ggml/include/ggml-cuda.h
- ../../ggml/include/ggml-kompute.h
- ../../ggml/include/ggml-metal.h
- ../../ggml/include/ggml-sycl.h
- ../../ggml/include/ggml-vulkan.h
- ../../ggml/include/ggml-blas.h
- ../../scripts/get-flags.mk
- ../../examples/dr_wav.h
- ../../LICENSE

View File

@ -1,8 +0,0 @@
require "test/unit"
require "whisper"
require_relative "jfk_reader/jfk_reader"
# Shared base class of the binding tests; exposes the fixture locations.
class TestBase < Test::Unit::TestCase
  # Model file used by the tests, relative to this directory.
  MODEL = File.join(__dir__, *%w[.. .. .. models ggml-base.en.bin])
  # Sample recording used by the tests, relative to this directory.
  AUDIO = File.join(__dir__, *%w[.. .. .. samples jfk.wav])
end

View File

@ -1,5 +0,0 @@
Makefile
jfk_reader.o
jfk_reader.so
jfk_reader.bundle
jfk_reader.dll

View File

@ -1,3 +0,0 @@
require "mkmf"
# Generate the Makefile that builds the jfk_reader test helper extension.
create_makefile("jfk_reader")

View File

@ -1,108 +0,0 @@
#include <ruby.h>
#include <ruby/memory_view.h>
#include <ruby/encoding.h>
/*
 * JFKReader#initialize(audio_path): remembers the path in the hidden
 * "audio_path" instance variable for later use by the memory view callbacks.
 */
static VALUE
jfk_reader_initialize(VALUE self, VALUE audio_path)
{
  (void)rb_iv_set(self, "audio_path", audio_path);

  return Qnil;
}
/*
 * MemoryView "get" callback of JFKReader.
 *
 * Reads 176000 16 bit samples starting at byte offset 78 of the file named by
 * the object's "audio_path" variable, converts them to floats in [-1, 1) and
 * exposes the resulting malloc'ed buffer as a one-dimensional view of format
 * "f". Returns false (no view) when the file cannot be opened or memory cannot
 * be allocated. Samples missing because of a short read are exposed as 0.0.
 */
static bool
jfk_reader_get_memory_view(const VALUE obj, rb_memory_view_t *view, int flags)
{
  VALUE audio_path = rb_iv_get(obj, "audio_path");
  /* May raise, so it is done before any resource is acquired. */
  const char *audio_path_str = StringValueCStr(audio_path);
  const int n_samples = 176000;

  FILE *file = fopen(audio_path_str, "rb");
  if (file == NULL) {
    return false;
  }

  /* calloc: anything fread does not fill stays zero instead of garbage. */
  short *samples = (short *)calloc(n_samples, sizeof(short));
  float *data = (float *)malloc(n_samples * sizeof(float));
  if (samples == NULL || data == NULL) {
    free(samples);
    free(data);
    fclose(file);
    return false;
  }

  if (fseek(file, 78, SEEK_SET) == 0) {
    fread(samples, sizeof(short), n_samples, file);
  }
  fclose(file);

  for (int i = 0; i < n_samples; i++) {
    data[i] = samples[i]/32768.0;
  }
  /* The raw samples are only an intermediate; do not leak them. */
  free(samples);

  view->obj = obj;
  view->data = (void *)data;
  view->byte_size = sizeof(float) * n_samples;
  view->readonly = true;
  view->format = "f";
  view->item_size = sizeof(float);
  view->item_desc.components = NULL;
  view->item_desc.length = 0;
  view->ndim = 1;
  view->shape = NULL;
  view->sub_offsets = NULL;
  view->private_data = NULL;

  return true;
}
/*
 * MemoryView "release" callback of JFKReader: frees the sample buffer that
 * jfk_reader_get_memory_view allocated with malloc, so every get/release pair
 * no longer leaks it. Always reports success.
 */
static bool
jfk_reader_release_memory_view(const VALUE obj, rb_memory_view_t *view)
{
  if (view != NULL) {
    free(view->data);
    view->data = NULL;
  }

  return true;
}
/* MemoryView "available?" callback: every JFKReader claims to offer a view. */
static bool
jfk_reader_memory_view_available_p(const VALUE obj)
{
return true;
}
/* Callback table registered for JFKReader: get, release, available? (in that order). */
static const rb_memory_view_entry_t jfk_reader_view_entry = {
jfk_reader_get_memory_view,
jfk_reader_release_memory_view,
jfk_reader_memory_view_available_p
};
/*
 * read_jfk(audio_path) -> [samples, data, packed]
 *
 * Reads 176000 16 bit samples starting at byte offset 78 of +audio_path+ and
 * returns three views of them:
 *   samples - Array of the raw Integer samples,
 *   data    - Array of Floats (sample / 32768.0),
 *   packed  - binary String holding the same values as native 32 bit floats.
 * Raises ArgumentError unless exactly one argument is given and a
 * SystemCallError when the file cannot be opened. Samples missing because of
 * a short read are reported as 0.
 */
static VALUE
read_jfk(int argc, VALUE *argv, VALUE obj)
{
  rb_check_arity(argc, 1, 1);
  const char *audio_path_str = StringValueCStr(argv[0]);
  const int n_samples = 176000;

  /*
   * Roughly 1 MB of scratch space: keep it off the machine stack. The temp
   * buffers are GC-managed, so raising below cannot leak them.
   */
  VALUE samples_holder;
  VALUE data_holder;
  short *samples = ALLOCV_N(short, samples_holder, n_samples);
  float *data = ALLOCV_N(float, data_holder, n_samples);
  MEMZERO(samples, short, n_samples);

  FILE *file = fopen(audio_path_str, "rb");
  if (file == NULL) {
    rb_sys_fail(audio_path_str);
  }
  if (fseek(file, 78, SEEK_SET) == 0) {
    fread(samples, sizeof(short), n_samples, file);
  }
  fclose(file);

  VALUE rb_samples = rb_ary_new2(n_samples);
  VALUE rb_data = rb_ary_new2(n_samples);
  for (int i = 0; i < n_samples; i++) {
    rb_ary_push(rb_samples, INT2FIX(samples[i]));
    rb_ary_push(rb_data, DBL2NUM(samples[i]/32768.0));
    data[i] = samples[i]/32768.0;
  }

  VALUE rb_void = rb_enc_str_new((const char *)data, n_samples * sizeof(float), rb_ascii8bit_encoding());
  VALUE rb_result = rb_ary_new3(3, rb_samples, rb_data, rb_void);

  ALLOCV_END(data_holder);
  ALLOCV_END(samples_holder);
  return rb_result;
}
/*
 * Extension entry point: defines the JFKReader class, registers its memory
 * view callbacks and exposes the global read_jfk function.
 */
void Init_jfk_reader(void)
{
  const VALUE reader_class = rb_define_class("JFKReader", rb_cObject);

  rb_memory_view_register(reader_class, &jfk_reader_view_entry);
  rb_define_method(reader_class, "initialize", jfk_reader_initialize, 1);

  rb_define_global_function("read_jfk", read_jfk, -1);
}

View File

@ -1,20 +0,0 @@
require_relative "helper"
# Checks the mapping from whisper error codes to Whisper::Error messages.
class TestError < TestBase
  # A known code yields its message and is readable back through #code.
  def test_error
    error = Whisper::Error.new(-2)

    assert_equal("failed to compute log mel spectrogram", error.message)
    assert_equal(-2, error.code)
  end

  # Codes without a dedicated message fall back to a generic one.
  def test_unknown_error
    assert_equal("unknown error", Whisper::Error.new(-20).message)
  end

  # Non-integer codes are rejected.
  def test_non_int_code
    assert_raise(TypeError) { Whisper::Error.new("non int") }
  end
end

View File

@ -1,44 +0,0 @@
require_relative "helper"
# Exercises Whisper::Model, the object returned by Whisper::Context#model.
class TestModel < TestBase
  def test_model
    context = Whisper::Context.new(MODEL)

    assert_instance_of(Whisper::Model, context.model)
  end

  def test_attributes
    context = Whisper::Context.new(MODEL)

    assert_expected_attributes(context.model)
  end

  # The model must stay usable after a GC run even though no local variable
  # references its context.
  def test_gc
    model = Whisper::Context.new(MODEL).model
    GC.start

    assert_expected_attributes(model)
  end

  private

  # Asserts the attribute values expected for the MODEL fixture.
  def assert_expected_attributes(model)
    assert_equal(51864, model.n_vocab)
    assert_equal(1500, model.n_audio_ctx)
    assert_equal(512, model.n_audio_state)
    assert_equal(8, model.n_audio_head)
    assert_equal(6, model.n_audio_layer)
    assert_equal(448, model.n_text_ctx)
    assert_equal(512, model.n_text_state)
    assert_equal(8, model.n_text_head)
    assert_equal(6, model.n_text_layer)
    assert_equal(80, model.n_mels)
    assert_equal(1, model.ftype)
    assert_equal("base", model.type)
  end
end

View File

@ -1,9 +1,9 @@
require_relative "helper"
require 'test/unit'
require 'tempfile'
require 'tmpdir'
require 'shellwords'
class TestPackage < TestBase
class TestPackage < Test::Unit::TestCase
def test_build
Tempfile.create do |file|
assert system("gem", "build", "whispercpp.gemspec", "--output", file.to_path.shellescape, exception: true)

View File

@ -1,6 +1,7 @@
require_relative "helper"
require 'test/unit'
require 'whisper'
class TestParams < TestBase
class TestParams < Test::Unit::TestCase
def setup
@params = Whisper::Params.new
end

View File

@ -1,14 +1,18 @@
require_relative "helper"
require "test/unit"
require "whisper"
class TestSegment < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
class TestSegment < TestBase
class << self
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(TestBase::MODEL)
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(TestBase::AUDIO, params)
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params)
end
end
@ -56,7 +60,7 @@ class TestSegment < TestBase
end
index += 1
end
whisper.transcribe(AUDIO, params)
whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
assert_equal 0, seg.start_time
assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
end
@ -72,7 +76,7 @@ class TestSegment < TestBase
assert_same seg, segment
return
end
whisper.transcribe(AUDIO, params)
whisper.transcribe(File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav'), params)
end
private

View File

@ -1,21 +1,20 @@
require_relative "helper"
require "stringio"
require "etc"
require 'whisper'
require 'test/unit'
# Exists to detect memory-related bug
Whisper.log_set ->(level, buffer, user_data) {}, nil
class TestWhisper < Test::Unit::TestCase
TOPDIR = File.expand_path(File.join(File.dirname(__FILE__), '..'))
class TestWhisper < TestBase
def setup
@params = Whisper::Params.new
end
def test_whisper
@whisper = Whisper::Context.new(MODEL)
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(AUDIO, params) {|text|
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params) {|text|
assert_match /ask not what your country can do for you, ask what you can do for your country/, text
}
end
@ -25,10 +24,11 @@ class TestWhisper < TestBase
attr_reader :whisper
def startup
@whisper = Whisper::Context.new(TestBase::MODEL)
@whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
params = Whisper::Params.new
params.print_timestamps = false
@whisper.transcribe(TestBase::AUDIO, params)
jfk = File.join(TOPDIR, '..', '..', 'samples', 'jfk.wav')
@whisper.transcribe(jfk, params)
end
end
@ -96,131 +96,4 @@ class TestWhisper < TestBase
Whisper.lang_str_full(Whisper.lang_max_id + 1)
end
end
# The installed callback receives every log line together with the exact
# user data object that was registered.
def test_log_set
  user_data = Object.new
  logs = []
  log_callback = lambda do |level, buffer, udata|
    logs << [level, buffer, udata]
  end
  Whisper.log_set(log_callback, user_data)
  Whisper::Context.new(MODEL)

  assert(logs.length > 30)
  expected_levels = [Whisper::LOG_LEVEL_DEBUG, Whisper::LOG_LEVEL_INFO, Whisper::LOG_LEVEL_WARN]
  logs.each do |level, _buffer, udata|
    assert_include(expected_levels, level)
    assert_same(user_data, udata)
  end
end
# With a no-op callback installed nothing is written to $stderr while a
# context is loaded.
def test_log_suppress
  original_stderr = $stderr
  silent_callback = lambda do |level, buffer, user_data|
    # do nothing
  end
  Whisper.log_set(silent_callback, nil)
  captured = StringIO.new("")
  $stderr = captured
  Whisper::Context.new(MODEL)
  assert_empty(captured.string)
ensure
  $stderr = original_stderr
end
sub_test_case "full" do
# Loads the model and decodes the fixture audio (skipping its first 78
# bytes) from little-endian 16 bit integers into floats.
def setup
  super
  @whisper = Whisper::Context.new(MODEL)
  raw_pcm = File.read(AUDIO, nil, 78)
  @samples = raw_pcm.unpack("s<*").map { |sample| sample.to_f / 2**15 }
end
def test_full
@whisper.full(@params, @samples, @samples.length)
assert_equal 1, @whisper.full_n_segments
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
end
def test_full_without_length
@whisper.full(@params, @samples)
assert_equal 1, @whisper.full_n_segments
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
end
def test_full_enumerator
samples = @samples.each
@whisper.full(@params, samples, @samples.length)
assert_equal 1, @whisper.full_n_segments
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
end
def test_full_enumerator_without_length
samples = @samples.each
assert_raise ArgumentError do
@whisper.full(@params, samples)
end
end
def test_full_enumerator_with_too_large_length
samples = @samples.each.take(10).to_enum
assert_raise StopIteration do
@whisper.full(@params, samples, 11)
end
end
def test_full_with_memory_view
samples = JFKReader.new(AUDIO)
@whisper.full(@params, samples)
assert_equal 1, @whisper.full_n_segments
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
end
def test_full_parallel
@whisper.full_parallel(@params, @samples, @samples.length, Etc.nprocessors)
assert_equal Etc.nprocessors, @whisper.full_n_segments
text = @whisper.each_segment.collect(&:text).join
assert_match /ask what you can do/i, text
assert_match /for your country/i, text
end
def test_full_parallel_with_memory_view
samples = JFKReader.new(AUDIO)
@whisper.full_parallel(@params, samples, nil, Etc.nprocessors)
assert_equal Etc.nprocessors, @whisper.full_n_segments
text = @whisper.each_segment.collect(&:text).join
assert_match /ask what you can do/i, text
assert_match /for your country/i, text
end
def test_full_parallel_without_length_and_n_processors
@whisper.full_parallel(@params, @samples)
assert_equal 1, @whisper.full_n_segments
text = @whisper.each_segment.collect(&:text).join
assert_match /ask what you can do/i, text
assert_match /for your country/i, text
end
def test_full_parallel_without_length
@whisper.full_parallel(@params, @samples, nil, Etc.nprocessors)
assert_equal Etc.nprocessors, @whisper.full_n_segments
text = @whisper.each_segment.collect(&:text).join
assert_match /ask what you can do/i, text
assert_match /for your country/i, text
end
def test_full_parallel_without_n_processors
@whisper.full_parallel(@params, @samples, @samples.length)
assert_equal 1, @whisper.full_n_segments
text = @whisper.each_segment.collect(&:text).join
assert_match /ask what you can do/i, text
assert_match /for your country/i, text
end
end
end

View File

@ -1,4 +1,4 @@
require_relative "extsources"
require "yaml"
Gem::Specification.new do |s|
s.name = "whispercpp"
@ -10,24 +10,24 @@ Gem::Specification.new do |s|
s.extra_rdoc_files = ['LICENSE', 'README.md']
s.files = `git ls-files . -z`.split("\x0") +
EXTSOURCES.collect {|file|
YAML.load_file("extsources.yaml").collect {|file|
basename = File.basename(file)
if s.extra_rdoc_files.include?(basename)
basename
else
file.sub("../..", "ext")
File.join("ext", basename)
end
}
s.summary = %q{Ruby whisper.cpp bindings}
s.test_files = s.files.select {|file| file.start_with? "tests/"}
s.test_files = ["tests/test_whisper.rb"]
s.extensions << 'ext/extconf.rb'
#### Documentation and testing.
s.homepage = 'https://github.com/ggerganov/whisper.cpp'
s.rdoc_options = ['--main', 'README.md']
s.rdoc_options = ['--main', '../../README.md']
s.platform = Gem::Platform::RUBY

View File

@ -1,10 +1,10 @@
prefix=@CMAKE_INSTALL_PREFIX@
exec_prefix=${prefix}
libdir=${exec_prefix}/lib
libdir=@CMAKE_INSTALL_FULL_LIBDIR@
includedir=${prefix}/include
Name: whisper
Description: Port of OpenAI's Whisper model in C/C++
Version: @PROJECT_VERSION@
Libs: -L${libdir} -lggml -lggml-base -lwhisper
Libs: -L${libdir} -lwhisper
Cflags: -I${includedir}

View File

@ -137,7 +137,7 @@ if (WHISPER_SDL2)
set_target_properties(lsp PROPERTIES FOLDER "examples")
if (GGML_SYCL)
add_subdirectory(sycl)
set_target_properties(ls-sycl-device PROPERTIES FOLDER "examples")
set_target_properties(sycl PROPERTIES FOLDER "examples")
endif()
endif (WHISPER_SDL2)
endif()

View File

@ -217,7 +217,6 @@ bool ggml_common_quantize_0(
case GGML_TYPE_Q4_0_8_8:
case GGML_TYPE_TQ1_0:
case GGML_TYPE_TQ2_0:
case GGML_TYPE_IQ4_NL_4_4:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));

View File

@ -204,6 +204,8 @@ static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
const size_t errbuffsize = 1024;
char errbuff[errbuffsize];
av_register_all(); // from avformat. Still a must-have call for ffmpeg v3! (can be skipped for later versions)
fmt_ctx = avformat_alloc_context();
avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);

View File

@ -5,5 +5,5 @@
set(TARGET ls-sycl-device)
add_executable(${TARGET} ls-sycl-device.cpp)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_17)

View File

@ -7,16 +7,13 @@ cd build
source /opt/intel/oneapi/setvars.sh
#for FP16
#cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
#cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DWHISPER_SYCL_F16=ON # faster for long-prompt inference
#for FP32
cmake .. -DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#for other features from the examples, e.g. stream and talk link with SDL2:
#cmake .. -DGGML_SYCL=ON -DWHISPER_SDL2=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
cmake .. -DWHISPER_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
#build example/main only
#cmake --build . --config Release --target main
#build all binary
cmake --build . --config Release -v
cmake --build . --config Release -v

View File

@ -63,30 +63,6 @@ static void llama_log_softmax(float * array, size_t size) {
}
*/
// Applies temperature `temp` to the logits in `cur_p`.
//
// - temp <= 0: keeps only the candidate with the highest logit (the first one on ties)
//   and sets every other logit to -INFINITY.
// - temp  > 0: divides every logit by `temp`.
//
// An empty candidate list is left untouched in both cases.
static void llama_sampler_temp_impl(llama_token_data_array * cur_p, float temp) {
    if (temp <= 0.0f) {
        // guard: data[0] below must not be read when there are no candidates
        if (cur_p->size == 0) {
            return;
        }

        // find the token with the highest logit and set the rest to -inf
        size_t max_i = 0;
        float  max_l = cur_p->data[0].logit;

        for (size_t i = 1; i < cur_p->size; ++i) {
            if (cur_p->data[i].logit > max_l) {
                // the previous best is no longer the maximum -> mask it
                cur_p->data[max_i].logit = -INFINITY;
                max_i = i;
                max_l = cur_p->data[i].logit;
            } else {
                cur_p->data[i].logit = -INFINITY;
            }
        }

        return;
    }

    for (size_t i = 0; i < cur_p->size; ++i) {
        cur_p->data[i].logit /= temp;
    }
}
static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
GGML_ASSERT(cur_p->size > 0);
@ -113,7 +89,7 @@ static void llama_sampler_softmax_impl(llama_token_data_array * cur_p) {
}
static void llama_sampler_top_k_impl(llama_token_data_array * cur_p, int32_t k) {
// TODO: move bucket sort to separate function so that top_p/typical/softmax first is equally fast
// TODO: move bucket sort to separate function so that top_p/tail_free/typical/softmax first is equally fast
// if (k >= (int32_t)cur_p->size) {
// return;
// }
@ -451,9 +427,6 @@ static const char * llama_sampler_dist_name(const struct llama_sampler * /*smpl*
// Calls llama_sampler_softmax_impl on the candidates, then stores the result of
// llama_sample_dist (drawn with this sampler's RNG) in `cur_p->selected`.
static void llama_sampler_dist_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    llama_sampler_softmax_impl(cur_p);

    auto * state = (llama_sampler_dist *) smpl->ctx;
    cur_p->selected = llama_sample_dist(cur_p, state->rng);
}
@ -733,6 +706,101 @@ struct llama_sampler * llama_sampler_init_min_p(float p, size_t min_keep) {
};
}
// tail-free
// Immutable parameters of the tail-free sampler.
struct llama_sampler_tail_free {
// cutoff for the cumulative normalized |second derivative|; the sampler is a no-op when z >= 1
const float z;
// the cut index must be at least this value before the candidate list is truncated
const size_t min_keep;
};
// Returns the fixed display name of the tail-free sampler; the sampler argument is unused.
static const char * llama_sampler_tail_free_name(const struct llama_sampler * /*smpl*/) {
return "tail-free";
}
// Tail-free truncation of the candidate list.
//
// After llama_sampler_softmax_impl, the absolute second differences of the
// probabilities are normalized to sum to 1 (or made uniform when their sum is
// <= 1e-6). The list is cut at the first index where their running sum exceeds
// `z`, provided that index is at least `min_keep`. No-op when z >= 1 or when
// there are at most two candidates.
static void llama_sampler_tail_free_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    const auto * params = (llama_sampler_tail_free *) smpl->ctx;

    if (params->z >= 1.0f || cur_p->size <= 2) {
        return;
    }

    llama_sampler_softmax_impl(cur_p);

    const size_t n_first  = cur_p->size - 1;
    const size_t n_second = cur_p->size - 2;

    // first differences between neighbouring probabilities
    std::vector<float> slope(n_first);
    for (size_t i = 0; i < n_first; ++i) {
        slope[i] = cur_p->data[i].p - cur_p->data[i + 1].p;
    }

    // absolute second differences and their sum (accumulated left to right)
    std::vector<float> curvature(n_second);
    for (size_t i = 0; i < n_second; ++i) {
        curvature[i] = slope[i] - slope[i + 1];
    }
    float total = 0.0f;
    for (size_t i = 0; i < n_second; ++i) {
        curvature[i] = std::abs(curvature[i]);
        total = total + curvature[i];
    }

    // normalize; fall back to a uniform distribution when the sum is (almost) zero
    if (total > 1e-6f) {
        for (float & value : curvature) {
            value /= total;
        }
    } else {
        for (float & value : curvature) {
            value = 1.0f / curvature.size();
        }
    }

    // locate the cut: running sum above z and at least min_keep entries before it
    size_t cut = cur_p->size;
    float running = 0.0f;
    for (size_t i = 0; i < n_second; ++i) {
        running += curvature[i];
        if (running > params->z && i >= params->min_keep) {
            cut = i;
            break;
        }
    }

    // keep only the candidates in front of the cut
    cur_p->size = cut;
}
// Creates a new tail-free sampler with the same `z` and `min_keep` as `smpl`.
static struct llama_sampler * llama_sampler_tail_free_clone(const struct llama_sampler * smpl) {
    const auto * src = (const llama_sampler_tail_free *) smpl->ctx;

    const float  z        = src->z;
    const size_t min_keep = src->min_keep;

    return llama_sampler_init_tail_free(z, min_keep);
}
static void llama_sampler_tail_free_free(struct llama_sampler * smpl) {
delete (llama_sampler_tail_free *) smpl->ctx;
}
// Callback table for the tail-free sampler; it has no accept or reset callback.
static struct llama_sampler_i llama_sampler_tail_free_i = {
/* .name = */ llama_sampler_tail_free_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_tail_free_apply,
/* .reset = */ nullptr,
/* .clone = */ llama_sampler_tail_free_clone,
/* .free = */ llama_sampler_tail_free_free,
};
// Allocates a tail-free sampler with cutoff `z` and lower bound `min_keep`.
// Both the sampler and its context are heap-allocated with `new`.
struct llama_sampler * llama_sampler_init_tail_free(float z, size_t min_keep) {
    auto * state = new llama_sampler_tail_free {
        /* .z        = */ z,
        /* .min_keep = */ min_keep,
    };

    return new llama_sampler {
        /* .iface = */ &llama_sampler_tail_free_i,
        /* .ctx   = */ state,
    };
}
// typical
struct llama_sampler_typical {
@ -844,8 +912,9 @@ static const char * llama_sampler_temp_name(const struct llama_sampler * /*smpl*
static void llama_sampler_temp_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
const auto * ctx = (llama_sampler_temp *) smpl->ctx;
llama_sampler_temp_impl(cur_p, ctx->temp);
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= ctx->temp;
}
}
static struct llama_sampler * llama_sampler_temp_clone(const struct llama_sampler * smpl) {
@ -892,7 +961,6 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
if (ctx->delta > 0) {
const float min_temp = std::max(0.0f, ctx->temp - ctx->delta);
const float max_temp = ctx->temp + ctx->delta;
float exponent_val = ctx->exponent;
// no need to do anything if there is only one (or zero) candidates
@ -930,7 +998,9 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
#endif
// Apply the dynamically calculated temperature scaling
llama_sampler_temp_impl(cur_p, dyn_temp);
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= dyn_temp;
}
// Re-compute softmax probabilities after scaling logits with dynamic temperature
const double max_l_double = cur_p->data[0].logit;
@ -954,7 +1024,9 @@ static void llama_sampler_temp_ext_apply(struct llama_sampler * smpl, llama_toke
}
#endif
} else {
llama_sampler_temp_impl(cur_p, ctx->temp);
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].logit /= ctx->temp;
}
}
}
@ -987,101 +1059,6 @@ struct llama_sampler * llama_sampler_init_temp_ext(float temp, float delta, floa
};
}
// xtc
// State of the XTC sampler.
struct llama_sampler_xtc {
// apply does nothing when this is <= 0, or when the uniform draw in [0, 1) exceeds it
const float probability;
// leading candidates with p >= threshold are counted; apply does nothing when threshold > 0.5
const float threshold;
// candidates are only dropped if at least this many remain afterwards
const size_t min_keep;
// seed as passed to the constructor; fed to get_rng_seed on init and reset
const uint32_t seed;
// value returned by get_rng_seed(seed), used to seed `rng`
uint32_t seed_cur;
// generator used for the per-call uniform draw
std::mt19937 rng;
};
// Returns the fixed display name of the XTC sampler; the sampler argument is unused.
static const char * llama_sampler_xtc_name(const struct llama_sampler * /*smpl*/) {
return "xtc";
}
// XTC step: with probability `probability`, drops the leading candidates whose
// probability is >= `threshold`, except the last of them, as long as at least
// `min_keep` candidates remain.
//
// Does nothing when probability <= 0, threshold > 0.5, fewer than two candidates
// exist, or the uniform draw exceeds `probability`. One random number is consumed
// from the sampler's RNG whenever the first three conditions pass.
static void llama_sample_xtc_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
    auto * state = (llama_sampler_xtc *) smpl->ctx;

    const bool inactive = state->probability <= 0.0f
                       || state->threshold > 0.5f
                       || cur_p->size < 2;
    if (inactive) {
        return;
    }

    std::uniform_real_distribution<float> unit(0.0f, 1.0f);
    const float roll = unit(state->rng);
    if (roll > state->probability) {
        return;
    }

    // in case it's not sorted/recalculated yet
    llama_sampler_softmax_impl(cur_p);

    // index of the last leading candidate that still reaches the threshold
    int pos_last = 0;
    for (size_t i = 0; i < cur_p->size && cur_p->data[i].p >= state->threshold; ++i) {
        pos_last = i;
    }

    const bool enough_left = cur_p->size - pos_last >= state->min_keep;
    if (enough_left && pos_last > 0) {
        // advance the view past the removed head of the list
        cur_p->data += pos_last;
        cur_p->size -= pos_last;
    }
}
// Creates a new XTC sampler with the same parameters as `smpl` and copies the
// current RNG state into it.
static struct llama_sampler * llama_sampler_xtc_clone(const struct llama_sampler * smpl) {
    const auto * src = (const llama_sampler_xtc *) smpl->ctx;

    auto * copy = llama_sampler_init_xtc(src->probability, src->threshold, src->min_keep, src->seed);

    // copy the state
    auto * dst = (llama_sampler_xtc *) copy->ctx;
    dst->rng = src->rng;

    return copy;
}
static void llama_sampler_xtc_free(struct llama_sampler * smpl) {
delete (llama_sampler_xtc *) smpl->ctx;
}
// Recomputes the effective seed from the stored seed via get_rng_seed and
// reseeds the sampler's RNG with it.
static void llama_sampler_xtc_reset(struct llama_sampler * smpl) {
    auto * state = (llama_sampler_xtc *) smpl->ctx;

    state->seed_cur = get_rng_seed(state->seed);

    const uint32_t effective_seed = state->seed_cur;
    state->rng.seed(effective_seed);
}
// Callback table for the XTC sampler; it has no accept callback.
// Note that the apply callback is named llama_sample_xtc_apply (not llama_sampler_...).
static struct llama_sampler_i llama_sampler_xtc_i = {
/* .name = */ llama_sampler_xtc_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sample_xtc_apply,
/* .reset = */ llama_sampler_xtc_reset,
/* .clone = */ llama_sampler_xtc_clone,
/* .free = */ llama_sampler_xtc_free,
};
// Allocates an XTC sampler.
//   p        - value stored as `probability`
//   t        - value stored as `threshold`
//   min_keep - value stored as `min_keep`
//   seed     - stored as-is; get_rng_seed(seed) provides the value that seeds the RNG
struct llama_sampler * llama_sampler_init_xtc(float p, float t, size_t min_keep, uint32_t seed) {
    auto seed_cur = get_rng_seed(seed);

    auto * state = new llama_sampler_xtc {
        /* .probability = */ p,
        /* .threshold   = */ t,
        /* .min_keep    = */ min_keep,
        /* .seed        = */ seed,
        /* .seed_cur    = */ seed_cur,
        /* .rng         = */ std::mt19937(seed_cur),
    };

    return new llama_sampler {
        /* .iface = */ &llama_sampler_xtc_i,
        /* .ctx   = */ state,
    };
}
// mirostat
struct llama_sampler_mirostat {
@ -1588,400 +1565,6 @@ struct llama_sampler * llama_sampler_init_penalties(
};
}
// DRY
// State of the DRY repetition-penalty sampler.
struct llama_sampler_dry {
// caps the number of recent tokens scanned; also used as the window when dry_penalty_last_n == -1
int32_t total_context_size;
// scale of the penalty; the sampler is inactive when this is 0
const float dry_multiplier;
// base of the exponential penalty; the sampler is inactive when this is < 1
const float dry_base;
// repeats shorter than this are not penalized; also subtracted from the repeat length in the exponent
const int32_t dry_allowed_length;
// number of recent tokens to scan; -1 selects total_context_size, 0 disables the sampler
const int32_t dry_penalty_last_n;
// sequence breakers: head token -> tail token sequence (empty tail for single-token breakers)
std::unordered_multimap<llama_token, std::vector<llama_token>> dry_processed_breakers;
// scratch: per-position repeat lengths, reassigned on every apply
std::vector<int> dry_repeat_count;
// scratch: token -> longest repeat that emitting the token would extend, cleared on every apply
std::unordered_map<llama_token, int> dry_max_token_repeat;
// history of accepted tokens
ring_buffer<llama_token> last_tokens;
};
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
static void get_overlapping_token_sequences(const llama_vocab & vocab, const std::string& str, std::unordered_multimap<llama_token, std::vector<llama_token>>& token_sequences, int max_tail_len = -1) {
for (llama_token token_id = 0; token_id < (llama_token)vocab.n_vocab; token_id++) {
std::string word = llama_detokenize(vocab, {token_id}, true);
if (word.find(str) != std::string::npos) {
token_sequences.emplace(token_id, std::vector<llama_token>());
} else {
size_t word_len = word.size(), str_len = str.size();
size_t pos = -1;
while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
bool match = true;
size_t i;
for (i = 1; i < str_len && i + pos < word_len; ++i) {
if (word[pos + i] != str[i]) {
match = false;
break;
}
}
if (match) {
std::vector<llama_token> tokenization = llama_tokenize_internal(vocab, str.substr(i), false, false);
if (max_tail_len >= 0 && tokenization.size() > (size_t)max_tail_len) {
tokenization.resize(max_tail_len);
}
// Ensure we don't already have a duplicate matching tokenization
auto its = token_sequences.equal_range(token_id);
bool found = false;
for (auto it = its.first; it != its.second; ++it) {
if (tokenization == it->second) {
found = true;
break;
}
}
if (!found) {
token_sequences.emplace(token_id, tokenization);
}
}
}
}
}
}
// Returns the fixed display name of the DRY sampler; the sampler argument is unused.
static const char * llama_sampler_dry_name(const struct llama_sampler * /*smpl*/) {
return "dry";
}
// Appends `token` to the DRY token history, unless the sampler is inactive
// (multiplier == 0, base < 1, or penalty_last_n == 0).
static void llama_sampler_dry_accept(struct llama_sampler * smpl, llama_token token) {
    auto * state = (llama_sampler_dry *) smpl->ctx;

    const bool active = state->dry_multiplier != 0.0f
                     && !(state->dry_base < 1.0f)
                     && state->dry_penalty_last_n != 0;

    if (active) {
        state->last_tokens.push_back(token);
    }
}
// Ported from Koboldcpp, original PR: https://github.com/LostRuins/koboldcpp/pull/982 (Original author: pi6am)
//
// Applies the DRY penalty to the logits in `cur_p`.
//
// Does nothing when the sampler is inactive (multiplier == 0, base < 1 or
// penalty_last_n == 0), when the scanned window is not longer than
// dry_allowed_length, or when a restart sequence limits the repeat length below
// dry_allowed_length. Otherwise, every candidate that would extend a repeated
// suffix of the token history (and is not itself a single-token sequence breaker)
// has `dry_multiplier * dry_base^(repeat_len - dry_allowed_length)` subtracted
// from its logit, and `cur_p->sorted` is cleared.
static void llama_sampler_dry_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_dry *) smpl->ctx;
if (ctx->dry_multiplier == 0.0f || ctx->dry_base < 1.0f || ctx->dry_penalty_last_n == 0) {
return;
}
// window size: -1 means "whole context", negative values other than -1 clamp to 0
int32_t effective_dry_penalty_last_n = (ctx->dry_penalty_last_n == -1) ? ctx->total_context_size : std::max(ctx->dry_penalty_last_n, 0);
// never look at more tokens than are stored, than the window allows, or than the context holds
int last_n_repeat = std::min(std::min((int)ctx->last_tokens.size(), effective_dry_penalty_last_n), ctx->total_context_size);
if (last_n_repeat <= ctx->dry_allowed_length) {
return;
}
// reset the per-call scratch state
ctx->dry_repeat_count.assign(last_n_repeat, 0);
ctx->dry_max_token_repeat.clear();
// Step 1: Look for restart sequences to limit the maximum repetition length.
// Work backwards through the context looking for any token that begins a restart sequence.
//
// The collection `restart_sequences` is a mapping from a "head" token to all "tail"
// sequences that together comprise a restart sequence. This allows us to quickly check
// whether each token is the head of a complete sequence. Most restart sequences are actually
// a single token, and for these the "tail" is an empty vector.
//
// If the token is a "head", test all restart sequences that begin with this token
// (there will often only be one sequence for each token, but if sequences like 'aaaq1' and
// 'aaa1' are used as restart strings, both could start with 'aaa' when tokenized). The
// longest matching sequence (if any) is used to limit the maximum repetition length.
//
// Note that in the case of a short sequence contained in a longer one, this might fail to
// find the smallest value for `rep_limit`. For example, if 'amniotic' and 'ni' are both used as
// restart sequences, 'ni' will be found first, and since it's shorter it will fail to suppress
// 'otic'. This is a minor issue since fully contained restart sequences are likely to be rare.
//
// This is theoretically worst-case O(N^2) for arbitrary restart sequences, which is why we
// have already clamped the maximum tail sequence length when generating `restart_sequences`.
// With clamping, this scan is O(N) in the context length.
int rep_limit = last_n_repeat;
for (int i = 0; i < last_n_repeat; ++i) {
llama_token token = ctx->last_tokens.rat(i);
auto its = ctx->dry_processed_breakers.equal_range(token);
if (its.first == ctx->dry_processed_breakers.end()) {
continue;
}
int longest_match = -1;
for (auto it = its.first; it != its.second; ++it) {
// Note that (*it) does not contain the head character, so seq_len will be
// the restart sequence length minus 1.
// In the common case of a single-token restart sequence, (*it) will be empty
// and we will trivially match.
int seq_len = (int)it->second.size();
if (seq_len > longest_match && seq_len <= (int)i) {
bool match = true;
for (int offset = 0; offset < seq_len; ++offset) {
// The -1 when indexing `last_tokens` is because we already matched the head.
if (it->second[offset] != ctx->last_tokens.rat(i - offset - 1)) {
match = false;
break;
}
}
if (match) {
longest_match = seq_len;
}
}
}
if (longest_match >= 0) {
// We found a restart sequence starting `i` tokens from the end and continuing for
// `longest_match` tokens.
rep_limit = i - longest_match;
break;
}
}
if (rep_limit < ctx->dry_allowed_length) {
return;
}
// Step 2: Iterate in reverse over the last N tokens of the context, using the "Z-algorithm" (in
// the reverse direction) to efficiently compute the positions and lengths of suffixes appearing
// elsewhere in the context. We limit the suffix length to `rep_limit` to respect restart sequences.
//
// This algorithm is not currently documented on Wikipedia, but there is a clear description here:
// https://ivanyu.me/blog/2014/10/15/z-algorithm/
//
// The code below is adapted from the public domain implementation by the same author here:
// https://github.com/ivanyu/string-algorithms/blob/master/z_algorithm.py
//
// Example:
// Last N tokens: a b c c b c y a b c
// Repeat counts: 0 0 3 1 0 2 0 0 0 0
// ^
// This `3` means that the last three tokens of the context (a b c) also appear here.
//
// This step is worst case O(N) since the Z-algorithm is linear, despite the appearance of nested
// for/while loops. This can be seen by observing that the `lt` and `rt` bounds are set after each
// repeated suffix is detected (i.e. after each while loop when n > 0). These bound variables
// ensure that the inner while loops only examine each token in the context once as the outer
// for loop iterates over the context.
{
const int last = last_n_repeat - 1;
int rt = 0, lt = 0;
for (int k = 1; k < last_n_repeat; ++k) {
if (k > rt) {
// If k is outside the current Z-box, do naive computation.
int n = 0;
while (n + k < last_n_repeat && ctx->last_tokens.rat(n) == ctx->last_tokens.rat(n+k)) {
++n;
}
ctx->dry_repeat_count[last - k] = std::min(n, rep_limit);
if (n > 0) {
lt = k;
rt = k+n-1;
}
} else {
// If k is inside the current Z-box, consider two cases.
int p = k - lt; // Pair index.
int right_part_len = rt - k + 1;
if (ctx->dry_repeat_count[last - p] < right_part_len) {
int n = std::min(ctx->dry_repeat_count[last - p], rep_limit);
ctx->dry_repeat_count[last - k] = n;
} else {
int i = rt + 1;
while (i < last_n_repeat && ctx->last_tokens.rat(i) == ctx->last_tokens.rat(i - k)) {
i += 1;
}
int n = std::min(i - k, rep_limit);
ctx->dry_repeat_count[last - k] = n;
lt = k;
rt = i - 1;
}
}
}
}
// Step 3: Iterate over dry_repeat_count and last_tokens, examining the maximum repeat length
// that would be generated by emitting each new token that would extend a sequence.
//
// Following the same example as above:
// Last N tokens: a b c c b c y a b c
// Repeat counts: 0 0 3 1 0 2 0 0 0 0
//
// For each non-zero, look ahead one token. This token, if emitted, would extend the repetition.
// c: 3 -> 4 (from `a b c` to `a b c c`)
// b: 1 -> 2 (from `c` to `c b`)
// y: 2 -> 3 (from `b c` to `b c y`)
for (int i = 0; i < last_n_repeat - 1; ++i) {
int repeat_len = ctx->dry_repeat_count[i];
if (repeat_len >= ctx->dry_allowed_length) {
// This token ends a repeat, so the next token would continue one.
// By convention, the value of `repeat_len` only includes the tokens currently
// in the context, not the new token that would be added.
llama_token token = ctx->last_tokens.rat(last_n_repeat - 2 - i);
// Track the maximum sequence ending in this token.
const auto& it = ctx->dry_max_token_repeat.find(token);
if (it == ctx->dry_max_token_repeat.end() || it->second < repeat_len) {
ctx->dry_max_token_repeat[token] = repeat_len;
}
}
}
// Step 4: Apply logit penalties based on the maximum repeat length for relevant tokens.
// Prevent floating point overflow in `pow(penalty_base, exponent)` by clamping to `max_exponent`.
// Compute it from `penalty_base` and the approximate log of `std::numeric_limits<float>::max()`
const float FLOAT_MAX_LOG = 88.7228391f;
// stays 0 (= no clamping) when the base is not above 1.000001
int max_exponent = 0;
if (ctx->dry_base > 1.000001f) {
max_exponent = FLOAT_MAX_LOG / std::log(ctx->dry_base);
}
for (size_t i = 0; i < cur_p->size; ++i) {
const auto& af_kvp = ctx->dry_max_token_repeat.find(cur_p->data[i].id);
if (af_kvp != ctx->dry_max_token_repeat.end()) {
// Check all sequence breakers starting with this token
auto range = ctx->dry_processed_breakers.equal_range(cur_p->data[i].id);
bool is_single_token_breaker = false;
for (auto it = range.first; it != range.second; ++it) {
if (it->second.empty()) {
is_single_token_breaker = true;
break;
}
}
// Apply penalty only if it's not a single-token sequence breaker
if (!is_single_token_breaker) {
int repeat_exp = af_kvp->second - ctx->dry_allowed_length;
if (max_exponent > 0 && repeat_exp > max_exponent) {
repeat_exp = max_exponent;
}
float penalty = ctx->dry_multiplier * std::pow(ctx->dry_base, repeat_exp);
cur_p->data[i].logit -= penalty;
}
}
}
// logits changed, so any previous ordering of the candidates is no longer guaranteed
cur_p->sorted = false;
}
// Clears the DRY token history and both per-call scratch containers.
// The processed sequence breakers are kept.
static void llama_sampler_dry_reset(struct llama_sampler * smpl) {
    auto & state = *(llama_sampler_dry *) smpl->ctx;

    state.last_tokens.clear();
    state.dry_repeat_count.clear();
    state.dry_max_token_repeat.clear();
}
// Creates a new DRY sampler with the same numeric parameters as `smpl`, then
// copies over the processed breakers, scratch containers and token history.
static struct llama_sampler * llama_sampler_dry_clone(const struct llama_sampler * smpl) {
    const auto * src = (llama_sampler_dry *) smpl->ctx;

    // dummy vocab is passed because it is only needed for raw sequence breaker processing,
    // which we have already done and will simply be copying
    llama_vocab dummy_vocab;

    auto * copy = llama_sampler_init_dry_impl(
        dummy_vocab,
        src->total_context_size,
        src->dry_multiplier,
        src->dry_base,
        src->dry_allowed_length,
        src->dry_penalty_last_n,
        NULL,
        0);

    // Copy the state, including the processed breakers
    auto * dst = (llama_sampler_dry *) copy->ctx;
    dst->dry_processed_breakers = src->dry_processed_breakers;
    dst->dry_repeat_count       = src->dry_repeat_count;
    dst->dry_max_token_repeat   = src->dry_max_token_repeat;
    dst->last_tokens            = src->last_tokens;

    return copy;
}
static void llama_sampler_dry_free(struct llama_sampler * smpl) {
delete (llama_sampler_dry *) smpl->ctx;
}
// Callback table for the DRY sampler; all six callbacks are provided.
static struct llama_sampler_i llama_sampler_dry_i = {
/* .name = */ llama_sampler_dry_name,
/* .accept = */ llama_sampler_dry_accept,
/* .apply = */ llama_sampler_dry_apply,
/* .reset = */ llama_sampler_dry_reset,
/* .clone = */ llama_sampler_dry_clone,
/* .free = */ llama_sampler_dry_free,
};
// Allocates a DRY sampler.
//
//   vocab               - used only to turn the textual sequence breakers into token sequences
//   context_size        - stored as total_context_size; also the window when dry_penalty_last_n == -1
//   dry_multiplier      - penalty scale; 0 disables the sampler
//   dry_base            - penalty base; values < 1 disable the sampler
//   dry_allowed_length  - repeat length below which no penalty is applied
//   dry_penalty_last_n  - window size; -1 selects context_size, 0 disables the sampler
//   seq_breakers        - array of `num_breakers` C strings (may be nullptr); null or empty
//                         entries are skipped with a warning, long ones are truncated to
//                         MAX_CHAR_LEN characters
//
// When the sampler is disabled, no breakers are processed and the history buffer
// and repeat-count vector are created empty.
struct llama_sampler * llama_sampler_init_dry_impl(const struct llama_vocab & vocab, int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const char** seq_breakers, size_t num_breakers) {
    int32_t effective_dry_penalty_last_n = (dry_penalty_last_n == -1) ? context_size : std::max(dry_penalty_last_n, 0);
    std::unordered_multimap<llama_token, std::vector<llama_token>> processed_breakers;

    const int MAX_CHAR_LEN = 40; // longest breaker string that is processed
    const int MAX_SEQ_LEN  = 20; // longest tail (in tokens) stored per breaker

    const bool dry_enabled = (dry_multiplier != 0.0f && dry_base >= 1.0f && dry_penalty_last_n != 0);

    if (dry_enabled && seq_breakers != nullptr && num_breakers > 0) {
        // Process sequence breakers
        for (size_t i = 0; i < num_breakers; ++i) {
            if (seq_breakers[i] == nullptr || std::strlen(seq_breakers[i]) == 0) {
                LLAMA_LOG_WARN("skipping null or empty DRY sequence breaker at index %zu\n", i);
                continue;
            }

            // guaranteed non-empty by the check above, so no second emptiness test is needed
            std::string sequence_break(seq_breakers[i]);

            if (sequence_break.size() > (size_t) MAX_CHAR_LEN) {
                LLAMA_LOG_WARN("truncating DRY sequence breaker to %d characters\n", MAX_CHAR_LEN);
                sequence_break.resize(MAX_CHAR_LEN);
            }

            get_overlapping_token_sequences(vocab, sequence_break, processed_breakers, MAX_SEQ_LEN);
        }
    }

    return new llama_sampler {
        /* .iface = */ &llama_sampler_dry_i,
        /* .ctx   = */ new llama_sampler_dry {
            /* .total_context_size     = */ context_size,
            /* .dry_multiplier         = */ dry_multiplier,
            /* .dry_base               = */ dry_base,
            /* .dry_allowed_length     = */ dry_allowed_length,
            /* .dry_penalty_last_n     = */ dry_penalty_last_n,
            /* .dry_processed_breakers = */ std::move(processed_breakers),
            /* .dry_repeat_count       = */ dry_enabled ? std::vector<int>(effective_dry_penalty_last_n, 0) : std::vector<int>{},
            /* .dry_max_token_repeat   = */ {},
            /* .last_tokens            = */ dry_enabled ? ring_buffer<llama_token>(effective_dry_penalty_last_n) : ring_buffer<llama_token>(0),
        },
    };
}
// wrapper for test-sampling.cpp
struct llama_sampler * llama_sampler_init_dry_testing(int32_t context_size, float dry_multiplier, float dry_base, int32_t dry_allowed_length, int32_t dry_penalty_last_n, const std::vector<std::vector<llama_token>>& seq_breakers) {
llama_vocab dummy_vocab;
auto * result = llama_sampler_init_dry_impl(dummy_vocab, context_size, dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n, NULL, 0);
auto * ctx = (llama_sampler_dry *) result->ctx;
// Process the token-based sequence breakers
ctx->dry_processed_breakers.clear();
if (seq_breakers.empty()) {
LLAMA_LOG_WARN("empty DRY sequence breakers list in llama_sampler_init_dry_testing\n");
} else {
for (const auto& breaker : seq_breakers) {
if (breaker.empty()) {
LLAMA_LOG_WARN("skipping DRY empty sequence breaker\n");
continue;
}
llama_token head_token = breaker[0];
std::vector<llama_token> tail_tokens(breaker.begin() + 1, breaker.end());
ctx->dry_processed_breakers.emplace(head_token, std::move(tail_tokens));
}
if (ctx->dry_processed_breakers.empty()) {
LLAMA_LOG_WARN("no valid DRY sequence breakers processed in llama_sampler_init_dry_testing\n");
}
}
return result;
}
// logit-bias
struct llama_sampler_logit_bias {
@ -2061,229 +1644,6 @@ struct llama_sampler * llama_sampler_init_logit_bias(
};
}
// infill
//#define GGML_DEBUG_SAMPLER_INFILL
// State of the infill sampler.
struct llama_sampler_infill {
// vocabulary used for the end-of-generation checks and for converting tokens to text pieces
const struct llama_vocab * vocab;
// scratch buffers that receive the text pieces of the two tokens being compared
std::vector<char> buf0;
std::vector<char> buf1;
};
// Returns the fixed display name of the infill sampler; the sampler argument is unused.
static const char * llama_sampler_infill_name(const struct llama_sampler * /*smpl*/) {
return "infill";
}
static void llama_sampler_infill_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
auto * ctx = (llama_sampler_infill *) smpl->ctx;
llama_sampler_softmax_impl(cur_p);
#if defined(GGML_DEBUG_SAMPLER_INFILL)
#define LOG_DBG_CUR LLAMA_LOG_DEBUG
#else
#define LOG_DBG_CUR(...)
#endif
for (size_t i = 0; i < cur_p->size; ++i) {
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
float p_txt_sum = 0.0f;
float p_eog_sum = 0.0f;
for (size_t i = 0; i < cur_p->size; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
p_eog_sum += cur_p->data[i].p;
} else {
p_txt_sum += cur_p->data[i].p;
}
}
const float rat = p_eog_sum == 0.0 ? INFINITY : p_txt_sum / p_eog_sum; GGML_UNUSED(rat);
LOG_DBG_CUR("%s: p_txt_sum = %.2f, p_eog_sum = %.2f, rat = %.2f, n = %zu\n", __func__, p_txt_sum, p_eog_sum, rat, cur_p->size);
if (3*p_eog_sum*cur_p->size > p_txt_sum) {
LOG_DBG_CUR("%s: the ratio p_txt/p_eog = %.2f is too low -> sampling EOG\n", __func__, p_txt_sum/p_eog_sum);
// keep just the EOG tokens
const auto size_org = cur_p->size;
cur_p->size = 0;
float p_sum = 0.0f;
for (size_t i = 0; i < size_org; ++i) {
if (llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id)) {
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
}
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
}
return;
}
size_t n_combined = 0; GGML_UNUSED(n_combined);
// combine tokens with common prefix
for (size_t i0 = 0; i0 < cur_p->size; ++i0) {
for (size_t i1 = 0; i1 < cur_p->size; ++i1) {
if (cur_p->data[i0].logit == -INFINITY) {
break;
}
if (i0 == i1 || cur_p->data[i1].logit == -INFINITY) {
continue;
}
int len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
if (len0 < 0) {
ctx->buf0.resize(len0);
len0 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i0].id, ctx->buf0.data(), ctx->buf0.size(), 0, false);
assert(len0 > 0);
}
int len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
if (len1 < 0) {
ctx->buf1.resize(len1);
len1 = llama_token_to_piece_impl(*ctx->vocab, cur_p->data[i1].id, ctx->buf1.data(), ctx->buf1.size(), 0, false);
assert(len1 > 0);
}
// token i0 is a prefix of token i1
if (len0 > 0 && len0 <= len1 && memcmp(ctx->buf0.data(), ctx->buf1.data(), len0) == 0) {
int dst = i0;
int src = i1;
// merge into the token with higher probability
if (cur_p->data[i1].p > cur_p->data[i0].p) {
std::swap(dst, src);
}
cur_p->data[dst].p += cur_p->data[src].p;
cur_p->data[src].logit = -INFINITY;
cur_p->data[src].p = 0.0f;
n_combined++;
}
}
}
size_t n_non_eog = 0;
size_t size_org = cur_p->size;
float p_sum = 0.0f;
float thold = 0.2f;
cur_p->size = 0;
LOG_DBG_CUR("%s: n_combined = %zu, applying thold = %.3f\n", __func__, n_combined, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
}
if (!is_eog) {
++n_non_eog;
}
p_sum += cur_p->data[i].p;
// keep this token
cur_p->data[cur_p->size++] = cur_p->data[i];
}
LOG_DBG_CUR("%s: n_non_eog = %zu\n", __func__, n_non_eog);
// if no non-EOG tokens are left -> reduce cur_p to single EOT token
if (n_non_eog == 0) {
cur_p->size = 1;
cur_p->data[0].id = llama_token_eot_impl(*ctx->vocab);
cur_p->data[0].logit = 1.0f;
return;
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
size_org = cur_p->size;
p_sum = 0.0f;
thold = 1.0/(n_non_eog + 1);
cur_p->size = 0;
LOG_DBG_CUR("%s: applying thold = %.3f\n", __func__, thold);
for (size_t i = 0; i < size_org; ++i) {
const bool is_eog = llama_token_is_eog_impl(*ctx->vocab, cur_p->data[i].id);
if (cur_p->data[i].p < thold && !is_eog) {
continue;
}
p_sum += cur_p->data[i].p;
cur_p->data[cur_p->size++] = cur_p->data[i];
}
// normalize probs
for (size_t i = 0; i < cur_p->size; ++i) {
cur_p->data[i].p /= p_sum;
LOG_DBG_CUR("%s: cur_p[%3zu] = { id: %6d, p: %.6f, logit: %6.3f }\n", __func__, i, cur_p->data[i].id, cur_p->data[i].p, cur_p->data[i].logit);
}
#undef LOG_DBG_CUR
}
static struct llama_sampler * llama_sampler_infill_clone(const struct llama_sampler * smpl) {
const auto * ctx = (const llama_sampler_infill *) smpl->ctx;
return llama_sampler_init_infill_impl(*ctx->vocab);
}
static void llama_sampler_infill_free(struct llama_sampler * smpl) {
delete (llama_sampler_infill *) smpl->ctx;
}
// Callback table for the infill sampler.
//
// The entries are positional: each /* .field = */ annotation names the
// llama_sampler_i member the value is expected to initialize (the struct is
// declared elsewhere - keep the order in sync with that declaration).
// No accept or reset callback is registered for this sampler (both nullptr).
static struct llama_sampler_i llama_sampler_infill_i = {
/* .name = */ llama_sampler_infill_name,
/* .accept = */ nullptr,
/* .apply = */ llama_sampler_infill_apply,
/* .reset = */ nullptr,
/* .clone = */ llama_sampler_infill_clone,
/* .free = */ llama_sampler_infill_free,
};
// Creates an infill sampler for the given vocabulary.
//
// The returned sampler uses the llama_sampler_infill_i callback table and owns
// a heap-allocated llama_sampler_infill context. The context stores a pointer
// to `vocab` (not a copy), so `vocab` must remain valid while the sampler is in
// use, plus two scratch buffers used when converting tokens to text pieces.
struct llama_sampler * llama_sampler_init_infill_impl(
        const struct llama_vocab & vocab) {
    // initial size of each token-piece scratch buffer
    const size_t n_buf_init = 512;

    return new llama_sampler {
        /* .iface = */ &llama_sampler_infill_i,
        /* .ctx   = */ new llama_sampler_infill {
            /* .vocab = */ &vocab,
            /* .buf0  = */ std::vector<char>(n_buf_init),
            /* .buf1  = */ std::vector<char>(n_buf_init),
        },
    };
}
// utils
uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl) {

View File

@ -4,6 +4,8 @@
#include "llama-grammar.h"
#include <unordered_map>
struct llama_vocab;
struct llama_grammar;
@ -25,24 +27,3 @@ struct llama_sampler * llama_sampler_init_grammar_impl(
const struct llama_vocab & vocab,
const char * grammar_str,
const char * grammar_root);
struct llama_sampler * llama_sampler_init_infill_impl(
const struct llama_vocab & vocab);
struct llama_sampler * llama_sampler_init_dry_impl(
const struct llama_vocab & vocab,
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const char ** seq_breakers,
size_t num_breakers);
struct llama_sampler * llama_sampler_init_dry_testing(
int32_t context_size,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const std::vector<std::vector<llama_token>>& seq_breakers);

View File

@ -221,7 +221,7 @@ struct llm_tokenizer_spm_session {
}
// seed the work queue with all possible 2-character tokens.
for (int i = 1; i < (int) symbols.size(); ++i) {
for (size_t i = 1; i < symbols.size(); ++i) {
try_add_bigram(i - 1, i);
}
@ -563,7 +563,7 @@ struct llm_tokenizer_bpe_session {
index++;
symbols.emplace_back(sym);
}
for (int i = 1; i < (int) symbols.size(); ++i) {
for (size_t i = 1; i < symbols.size(); ++i) {
add_new_bigram(i - 1, i);
}
@ -1663,14 +1663,6 @@ llama_token llama_token_eos_impl(const struct llama_vocab & vocab) {
return vocab.special_eos_id;
}
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
}
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id;
}
llama_token llama_token_cls_impl(const struct llama_vocab & vocab) {
return vocab.special_cls_id;
}
@ -1696,39 +1688,23 @@ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
}
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pre_id;
return vocab.special_prefix_id;
}
llama_token llama_token_middle_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_mid_id;
return vocab.special_middle_id;
}
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_suf_id;
return vocab.special_suffix_id;
}
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pre_id;
llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
return vocab.special_eot_id;
}
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_suf_id;
}
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_mid_id;
}
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_pad_id;
}
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_rep_id;
}
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab) {
return vocab.special_fim_sep_id;
llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
return vocab.special_eom_id;
}
int32_t llama_tokenize_impl(
@ -1966,19 +1942,3 @@ int32_t llama_detokenize_impl(
return total <= text_len_max ? total : -total;
}
std::string llama_detokenize(const struct llama_vocab & vocab, const std::vector<llama_token> & tokens, bool special) {
std::string text;
text.resize(std::max(text.capacity(), tokens.size()));
int32_t n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
if (n_chars < 0) {
text.resize(-n_chars);
n_chars = llama_detokenize_impl(vocab, tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
GGML_ASSERT(n_chars <= (int32_t)text.size()); // whitespace trimming is performed after per-token detokenization
}
text.resize(n_chars);
// NOTE: the original tokenizer decodes bytes after collecting the pieces.
return text;
}

View File

@ -37,26 +37,20 @@ struct llama_vocab {
std::map<std::pair<std::string, std::string>, int> bpe_ranks;
// default LLaMA special tokens
// TODO: should we set all of these to LLAMA_TOKEN_NULL?
id special_bos_id = 1;
id special_eos_id = 2;
id special_eot_id = LLAMA_TOKEN_NULL;
id special_eom_id = LLAMA_TOKEN_NULL;
id special_unk_id = 0;
id special_sep_id = LLAMA_TOKEN_NULL;
id special_pad_id = LLAMA_TOKEN_NULL;
id special_cls_id = LLAMA_TOKEN_NULL;
id special_mask_id = LLAMA_TOKEN_NULL;
id special_sep_id = -1;
id special_pad_id = -1;
id special_cls_id = -1;
id special_mask_id = -1;
id linefeed_id = 13;
// fim tokens
id special_fim_pre_id = LLAMA_TOKEN_NULL;
id special_fim_suf_id = LLAMA_TOKEN_NULL;
id special_fim_mid_id = LLAMA_TOKEN_NULL;
id special_fim_pad_id = LLAMA_TOKEN_NULL;
id special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
id special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
id linefeed_id = 13;
id special_prefix_id = -1;
id special_suffix_id = -1;
id special_middle_id = -1;
id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
id special_eom_id = -1;
// set of all tokens that cause "end of generation"
std::set<id> special_eog_ids;
@ -110,26 +104,19 @@ bool llama_token_is_control_impl(const struct llama_vocab & vocab, llama_token t
llama_token llama_token_bos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eos_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl(const struct llama_vocab & vocab);
llama_token llama_token_eom_impl(const struct llama_vocab & vocab);
llama_token llama_token_cls_impl(const struct llama_vocab & vocab);
llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
llama_token llama_token_pad_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pre_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_suf_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_mid_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_pad_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_rep_impl(const struct llama_vocab & vocab);
llama_token llama_token_fim_sep_impl(const struct llama_vocab & vocab);
bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
bool llama_add_eos_token_impl(const struct llama_vocab & vocab);
llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
int32_t llama_tokenize_impl(
const struct llama_vocab & vocab,
@ -149,12 +136,6 @@ int32_t llama_token_to_piece_impl(
int32_t lstrip,
bool special);
// check if token0 is contained as a prefix in token1
bool llama_token_is_prefix_impl(
const struct llama_vocab & vocab,
llama_token token0,
llama_token token1);
int32_t llama_detokenize_impl(
const struct llama_vocab & vocab,
const llama_token * tokens,
@ -163,8 +144,3 @@ int32_t llama_detokenize_impl(
int32_t text_len_max,
bool remove_special,
bool unparse_special);
std::string llama_detokenize(
const struct llama_vocab & vocab,
const std::vector<llama_token> & tokens,
bool special);

File diff suppressed because it is too large Load Diff

View File

@ -2,7 +2,6 @@
#define LLAMA_H
#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-backend.h"
#include <stddef.h>
@ -185,8 +184,7 @@ extern "C" {
LLAMA_ROPE_SCALING_TYPE_NONE = 0,
LLAMA_ROPE_SCALING_TYPE_LINEAR = 1,
LLAMA_ROPE_SCALING_TYPE_YARN = 2,
LLAMA_ROPE_SCALING_TYPE_LONGROPE = 3,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
LLAMA_ROPE_SCALING_TYPE_MAX_VALUE = LLAMA_ROPE_SCALING_TYPE_YARN,
};
enum llama_pooling_type {
@ -207,7 +205,7 @@ extern "C" {
enum llama_split_mode {
LLAMA_SPLIT_MODE_NONE = 0, // single GPU
LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs
};
// TODO: simplify (https://github.com/ggerganov/llama.cpp/pull/9294#pullrequestreview-2286561979)
@ -219,7 +217,6 @@ extern "C" {
typedef struct llama_token_data_array {
// TODO: consider SoA
// NOTE: this pointer can be modified by the samplers
llama_token_data * data;
size_t size;
int64_t selected; // this is the index in the data array (i.e. not the token id)
@ -235,11 +232,8 @@ extern "C" {
// - token : the token ids of the input (used when embd is NULL)
// - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL)
// - pos : the positions of the respective token in the sequence
// (if set to NULL, the token position will be tracked automatically by llama_decode)
// - seq_id : the sequence to which the respective token belongs
// (if set to NULL, the sequence ID will be assumed to be 0)
// - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output
// (if set to NULL, only the logits for last token will be returned)
//
typedef struct llama_batch {
int32_t n_tokens;
@ -250,6 +244,15 @@ extern "C" {
int32_t * n_seq_id;
llama_seq_id ** seq_id;
int8_t * logits; // TODO: rename this to "output"
// NOTE: helpers for smooth API transition - can be deprecated in the future
// for future-proof code, use the above fields instead and ignore everything below
//
// pos[i] = all_pos_0 + i*all_pos_1
//
llama_pos all_pos_0; // used if pos == NULL
llama_pos all_pos_1; // used if pos == NULL
llama_seq_id all_seq_id; // used if seq_id == NULL
} llama_batch;
enum llama_model_kv_override_type {
@ -273,13 +276,13 @@ extern "C" {
};
struct llama_model_params {
// NULL-terminated list of devices to use for offloading (if NULL, all available devices are used)
ggml_backend_dev_t * devices;
int32_t n_gpu_layers; // number of layers to store in VRAM
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
// the GPU that is used for the entire model when split_mode is LLAMA_SPLIT_MODE_NONE
// main_gpu interpretation depends on split_mode:
// LLAMA_SPLIT_MODE_NONE: the GPU that is used for the entire model
// LLAMA_SPLIT_MODE_ROW: the GPU that is used for small tensors and intermediate results
// LLAMA_SPLIT_MODE_LAYER: ignored
int32_t main_gpu;
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
@ -430,7 +433,6 @@ extern "C" {
LLAMA_API bool llama_supports_mmap (void);
LLAMA_API bool llama_supports_mlock (void);
LLAMA_API bool llama_supports_gpu_offload(void);
LLAMA_API bool llama_supports_rpc (void);
LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);
@ -671,9 +673,6 @@ extern "C" {
// Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
// Check if the context supports KV cache shifting
LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
//
// State / sessions
//
@ -776,15 +775,15 @@ extern "C" {
// Decoding
//
// Return batch for single sequence of tokens
// The sequence ID will be fixed to 0
// The position of the tokens will be tracked automatically by llama_decode
// Return batch for single sequence of tokens starting at pos_0
//
// NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
//
LLAMA_API struct llama_batch llama_batch_get_one(
llama_token * tokens,
int32_t n_tokens);
int32_t n_tokens,
llama_pos pos_0,
llama_seq_id seq_id);
// Allocates a batch of tokens on the heap that can hold a maximum of n_tokens
// Each token can be assigned up to n_seq_max sequence ids
@ -804,7 +803,7 @@ extern "C" {
// Processes a batch of tokens with the encoder part of the encoder-decoder model.
// Stores the encoder output internally for later use by the decoder cross-attention layers.
// 0 - success
// < 0 - error. the KV cache state is restored to the state before this call
// < 0 - error
LLAMA_API int32_t llama_encode(
struct llama_context * ctx,
struct llama_batch batch);
@ -812,7 +811,7 @@ extern "C" {
// Positive return values do not mean a fatal error, but rather a warning.
// 0 - success
// 1 - could not find a KV slot for the batch (try reducing the size of the batch or increase the context)
// < 0 - error. the KV cache state is restored to the state before this call
// < 0 - error
LLAMA_API int32_t llama_decode(
struct llama_context * ctx,
struct llama_batch batch);
@ -897,7 +896,6 @@ extern "C" {
// Special tokens
LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn
LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification
LLAMA_API llama_token llama_token_sep(const struct llama_model * model); // sentence separator
LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line
@ -906,17 +904,11 @@ extern "C" {
LLAMA_API bool llama_add_bos_token(const struct llama_model * model);
LLAMA_API bool llama_add_eos_token(const struct llama_model * model);
// infill tokens
DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead");
DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead");
DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead");
LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model);
LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model);
// Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle
//
// Tokenization
@ -991,9 +983,6 @@ extern "C" {
char * buf,
int32_t length);
// Get list of built-in chat templates
LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
//
// Sampling API
//
@ -1078,13 +1067,12 @@ extern "C" {
// available samplers:
LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
LLAMA_API struct llama_sampler * llama_sampler_init_greedy (void);
LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed);
/// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
/// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first.
DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void),
"will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)");
LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k);
@ -1095,18 +1083,16 @@ extern "C" {
/// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841
LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API struct llama_sampler * llama_sampler_init_tail_free (float z, size_t min_keep);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep);
/// @details Updates the logits l_i` = l_i/t. When t <= 0.0f, the maximum logit is kept at its original value, the rest are set to -inf
LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t);
/// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772.
LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent);
/// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335
LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
@ -1146,43 +1132,11 @@ extern "C" {
bool penalize_nl, // consider newlines as a repeatable token
bool ignore_eos); // ignore the end-of-sequence token
/// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
LLAMA_API struct llama_sampler * llama_sampler_init_dry(
const struct llama_model * model,
float dry_multiplier,
float dry_base,
int32_t dry_allowed_length,
int32_t dry_penalty_last_n,
const char ** seq_breakers,
size_t num_breakers);
LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias(
int32_t n_vocab,
int32_t n_logit_bias,
const llama_logit_bias * logit_bias);
// this sampler is meant to be used for fill-in-the-middle infilling
// it's supposed to be used after top_k + top_p sampling
//
// 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG
// 2. combine probs of tokens that have the same prefix
//
// example:
//
// - before:
// "hel": 0.5
// "hell": 0.2
// "hello": 0.1
// "dummy": 0.1
//
// - after:
// "hel": 0.8
// "dummy": 0.1
//
// 3. discard non-EOG tokens with low prob
// 4. if no tokens are left -> pick EOT
//
LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model);
// Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise
LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl);
@ -1254,6 +1208,8 @@ extern "C" {
LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain);
LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain);
LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx);
#ifdef __cplusplus
}
#endif

View File

@ -2311,7 +2311,7 @@ const std::unordered_set<uint32_t> unicode_set_whitespace = {
0x003000,
};
// list is always in ascending order, to enable binary search
// list is always in ascending order, to enable binary searh
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase = {
{0x000041, 0x000061},
{0x000042, 0x000062},
@ -3748,7 +3748,7 @@ const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_lowercase
{0x01E921, 0x01E943},
};
// list is always in ascending order, to enable binary search
// list is always in ascending order, to enable binary searh
const std::initializer_list<std::pair<uint32_t, uint32_t>> unicode_map_uppercase = {
{0x000061, 0x000041},
{0x000062, 0x000042},

View File

@ -201,18 +201,7 @@ static std::unordered_map<std::string, uint8_t> unicode_utf8_to_byte_map() {
}
static inline std::wstring unicode_wstring_from_utf8(const std::string & s) {
#if defined(__clang__)
// disable C++17 deprecation warning for std::codecvt_utf8
# pragma clang diagnostic push
# pragma clang diagnostic ignored "-Wdeprecated-declarations"
#endif
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
#if defined(__clang__)
# pragma clang diagnostic pop
#endif
return conv.from_bytes(s);
}

View File

@ -2,11 +2,11 @@ cmake_minimum_required(VERSION 3.10)
project(whisper.cpp)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD 11)
set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../..)
# Path to external GGML, otherwise uses the copy in whisper.cpp.
option(GGML_HOME "whisper: Path to external GGML source" OFF)
option(GGML_HOME "whisper: Path to external GGML source" OFF)
set(
SOURCE_FILES
@ -14,8 +14,6 @@ set(
${CMAKE_SOURCE_DIR}/jni.c
)
# TODO: this needs to be updated to work with the new ggml CMakeLists
if (NOT GGML_HOME)
set(
SOURCE_FILES
@ -24,13 +22,7 @@ if (NOT GGML_HOME)
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-backend-reg.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
${WHISPER_LIB_DIR}/ggml/src/ggml-threading.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.c
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.cpp
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-aarch64.c
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c
)
endif()

View File

@ -24,12 +24,6 @@
18A2760B2C2A9B43001C8D37 /* ggml-metal.metal in Resources */ = {isa = PBXBuildFile; fileRef = 1844471D2AB2195F007D6BFE /* ggml-metal.metal */; };
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */ = {isa = PBXBuildFile; fileRef = 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */; };
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */; };
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */; };
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */; };
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */; };
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */; };
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; };
@ -55,8 +49,8 @@
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-aarch64.c"; path = "../../../ggml/src/ggml-aarch64.c"; sourceTree = "<group>"; };
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal/ggml-metal.m"; sourceTree = "<group>"; };
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal/ggml-metal.metal"; sourceTree = "<group>"; };
1844471B2AB21655007D6BFE /* ggml-metal.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; name = "ggml-metal.m"; path = "../../../ggml/src/ggml-metal.m"; sourceTree = "<group>"; };
1844471D2AB2195F007D6BFE /* ggml-metal.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; name = "ggml-metal.metal"; path = "../../../ggml/src/ggml-metal.metal"; sourceTree = "<group>"; };
18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
@ -82,17 +76,6 @@
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-threading.cpp"; path = "../../../ggml/src/ggml-threading.cpp"; sourceTree = "<group>"; };
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.cpp"; sourceTree = "<group>"; };
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-aarch64.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.h"; sourceTree = "<group>"; };
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.c"; sourceTree = "<group>"; };
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-impl.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-impl.h"; sourceTree = "<group>"; };
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-quants.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.h"; sourceTree = "<group>"; };
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-quants.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.c"; sourceTree = "<group>"; };
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-backend-reg.cpp"; path = "../../../ggml/src/ggml-backend-reg.cpp"; sourceTree = "<group>"; };
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
@ -132,17 +115,6 @@
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
isa = PBXGroup;
children = (
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */,
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.c */,
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */,
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */,
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */,
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */,
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */,
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */,
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */,
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */,
18133C7F2C64E342005CEAAC /* ggml-aarch64.c */,
18133C7E2C64E342005CEAAC /* ggml-aarch64.h */,
18A275FF2C2A9563001C8D37 /* ggml-common.h */,
@ -275,16 +247,10 @@
18627C9629052C5800BD2A04 /* ggml.c in Sources */,
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */,
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */,
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.c in Sources */,
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */,
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */,
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
18627C8C29052BE000BD2A04 /* main.m in Sources */,
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */,
1844471C2AB21655007D6BFE /* ggml-metal.m in Sources */,
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
);
@ -363,7 +329,6 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
HEADER_SEARCH_PATHS = ../../../ggml/src/;
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
MTL_FAST_MATH = YES;
@ -417,7 +382,6 @@
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
GCC_WARN_UNUSED_FUNCTION = YES;
GCC_WARN_UNUSED_VARIABLE = YES;
HEADER_SEARCH_PATHS = ../../../ggml/src/;
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
MTL_ENABLE_DEBUG_INFO = NO;
MTL_FAST_MATH = YES;
@ -440,7 +404,6 @@
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
HEADER_SEARCH_PATHS = ../../../ggml/src/;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
@ -470,7 +433,6 @@
DEVELOPMENT_TEAM = P8JZH34X63;
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
GENERATE_INFOPLIST_FILE = YES;
HEADER_SEARCH_PATHS = ../../../ggml/src/;
INFOPLIST_FILE = whisper.objc/Info.plist;
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;

View File

@ -1,5 +1,4 @@
import Foundation
import UIKit
import whisper
enum WhisperError: Error {
@ -56,91 +55,11 @@ actor WhisperContext {
return transcription
}
/// Returns the text produced by `whisper_bench_memcpy_str` as a Swift string.
///
/// - Parameter nThreads: Thread count forwarded unchanged to the C function.
/// - Returns: A copy of the C string returned by `whisper_bench_memcpy_str`.
static func benchMemcpy(nThreads: Int32) async -> String {
    String(cString: whisper_bench_memcpy_str(nThreads))
}
/// Returns the text produced by `whisper_bench_ggml_mul_mat_str` as a Swift string.
///
/// - Parameter nThreads: Thread count forwarded unchanged to the C function.
/// - Returns: A copy of the C string returned by `whisper_bench_ggml_mul_mat_str`.
static func benchGgmlMulMat(nThreads: Int32) async -> String {
    String(cString: whisper_bench_ggml_mul_mat_str(nThreads))
}
// Builds the "Config" cell of a benchmark row: a space-separated list of CPU feature names.
// NOTE(review): the only feature probe below is commented out, so `info` is never
// appended to and this currently always returns an empty string.
private func systemInfo() -> String {
var info = ""
//if (ggml_cpu_has_neon() != 0) { info += "NEON " }
// dropLast() strips the final character, i.e. the trailing space each entry would add.
return String(info.dropLast())
}
/// Benchmarks the loaded model and returns one Markdown table row with the timings.
///
/// The method first performs warm-up encode/decode calls, resets the timings, then runs
/// the measured encode and three decode loops. Any non-zero return code from the whisper
/// C API aborts the run and an "error: ..." string is returned instead of a table row.
///
/// - Parameters:
///   - modelName: Label placed in the "Model" column of the returned row.
///   - nThreads: Thread count passed to every `whisper_encode` / `whisper_decode` call.
/// - Returns: A `| ... |` row (device, OS, config, model, threads, timings) or an error string.
func benchFull(modelName: String, nThreads: Int32) async -> String {
let nMels = whisper_model_n_mels(context)
// Passes a nil buffer with 0 frames as the mel input for the benchmark.
if (whisper_set_mel(context, nil, 0, nMels) != 0) {
return "error: failed to set mel"
}
// warm-up: encoder
if (whisper_encode(context, 0, nThreads) != 0) {
return "error: failed to encode"
}
// Zero-filled token buffer reused by every decode call below.
var tokens = [whisper_token](repeating: 0, count: 512)
// warm-up: prompt processing (256 tokens in one call)
// NOTE(review): 3rd/4th arguments are presumably token count and past-context length — confirm against whisper.h
if (whisper_decode(context, &tokens, 256, 0, nThreads) != 0) {
return "error: failed to decode"
}
// warm-up: text generation (single token)
if (whisper_decode(context, &tokens, 1, 256, nThreads) != 0) {
return "error: failed to decode"
}
// Discard everything measured during warm-up.
whisper_reset_timings(context)
// measured run: encoder
if (whisper_encode(context, 0, nThreads) != 0) {
return "error: failed to encode"
}
// measured run: text generation, 256 single-token decodes
for i in 0..<256 {
if (whisper_decode(context, &tokens, 1, Int32(i), nThreads) != 0) {
return "error: failed to decode"
}
}
// measured run: batched decoding, 64 decodes of 5 tokens each
for _ in 0..<64 {
if (whisper_decode(context, &tokens, 5, 0, nThreads) != 0) {
return "error: failed to decode"
}
}
// measured run: prompt processing, 16 decodes of 256 tokens each
for _ in 0..<16 {
if (whisper_decode(context, &tokens, 256, 0, nThreads) != 0) {
return "error: failed to decode"
}
}
whisper_print_timings(context)
// UIDevice properties are read with `await` from inside this actor-isolated method.
let deviceModel = await UIDevice.current.model
let systemName = await UIDevice.current.systemName
let systemInfo = self.systemInfo()
// Copies the timings struct out of the pointer returned by the C API.
// NOTE(review): ownership of the returned pointer is not visible here — confirm whether it must be freed.
let timings: whisper_timings = whisper_get_timings(context).pointee
let encodeMs = String(format: "%.2f", timings.encode_ms)
let decodeMs = String(format: "%.2f", timings.decode_ms)
let batchdMs = String(format: "%.2f", timings.batchd_ms)
let promptMs = String(format: "%.2f", timings.prompt_ms)
// The sixth column is hard-coded to 1 and the last column is a "<todo>" placeholder.
return "| \(deviceModel) | \(systemName) | \(systemInfo) | \(modelName) | \(nThreads) | 1 | \(encodeMs) | \(decodeMs) | \(batchdMs) | \(promptMs) | <todo> |"
}
static func createContext(path: String) throws -> WhisperContext {
var params = whisper_context_default_params()
#if targetEnvironment(simulator)
params.use_gpu = false
print("Running on the simulator, using CPU")
#else
params.flash_attn = true // Enabled by default for Metal
#endif
let context = whisper_init_from_file_with_params(path, params)
if let context {

View File

@ -1,17 +0,0 @@
import Foundation
/// Describes one Whisper model: its display strings, its download URL, and the
/// file name it is stored under in the user's documents directory.
struct Model: Identifiable {
    /// Identity used by `Identifiable`; defaults to a freshly generated UUID.
    var id = UUID()
    var name: String
    var info: String
    var url: String
    var filename: String

    /// Location of `filename` inside the first documents directory of the user domain.
    var fileURL: URL {
        let documentDirectories = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)
        return documentDirectories[0].appendingPathComponent(filename)
    }

    /// Reports whether a file is currently present at `fileURL`.
    func fileExists() -> Bool {
        let localPath = fileURL.path
        return FileManager.default.fileExists(atPath: localPath)
    }
}

View File

@ -14,7 +14,7 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
private var recordedFile: URL? = nil
private var audioPlayer: AVAudioPlayer?
private var builtInModelUrl: URL? {
private var modelUrl: URL? {
Bundle.main.url(forResource: "ggml-base.en", withExtension: "bin", subdirectory: "models")
}
@ -28,59 +28,23 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
override init() {
super.init()
loadModel()
}
func loadModel(path: URL? = nil, log: Bool = true) {
do {
whisperContext = nil
if (log) { messageLog += "Loading model...\n" }
let modelUrl = path ?? builtInModelUrl
if let modelUrl {
whisperContext = try WhisperContext.createContext(path: modelUrl.path())
if (log) { messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" }
} else {
if (log) { messageLog += "Could not locate model\n" }
}
try loadModel()
canTranscribe = true
} catch {
print(error.localizedDescription)
if (log) { messageLog += "\(error.localizedDescription)\n" }
messageLog += "\(error.localizedDescription)\n"
}
}
func benchCurrentModel() async {
if whisperContext == nil {
messageLog += "Cannot bench without loaded model\n"
return
private func loadModel() throws {
messageLog += "Loading model...\n"
if let modelUrl {
whisperContext = try WhisperContext.createContext(path: modelUrl.path())
messageLog += "Loaded model \(modelUrl.lastPathComponent)\n"
} else {
messageLog += "Could not locate model\n"
}
messageLog += "Running benchmark for loaded model\n"
let result = await whisperContext?.benchFull(modelName: "<current>", nThreads: Int32(min(4, cpuCount())))
if (result != nil) { messageLog += result! + "\n" }
}
func bench(models: [Model]) async {
let nThreads = Int32(min(4, cpuCount()))
// messageLog += "Running memcpy benchmark\n"
// messageLog += await WhisperContext.benchMemcpy(nThreads: nThreads) + "\n"
//
// messageLog += "Running ggml_mul_mat benchmark with \(nThreads) threads\n"
// messageLog += await WhisperContext.benchGgmlMulMat(nThreads: nThreads) + "\n"
messageLog += "Running benchmark for all downloaded models\n"
messageLog += "| CPU | OS | Config | Model | Th | FA | Enc. | Dec. | Bch5 | PP | Commit |\n"
messageLog += "| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |\n"
for model in models {
loadModel(path: model.fileURL, log: false)
if whisperContext == nil {
messageLog += "Cannot bench without loaded model\n"
break
}
let result = await whisperContext?.benchFull(modelName: model.name, nThreads: nThreads)
if (result != nil) { messageLog += result! + "\n" }
}
messageLog += "Benchmarking completed\n"
}
func transcribeSample() async {
@ -196,8 +160,3 @@ class WhisperState: NSObject, ObservableObject, AVAudioRecorderDelegate {
isRecording = false
}
}
/// Returns the processor count reported by `ProcessInfo` for the current process.
fileprivate func cpuCount() -> Int {
    let processInfo = ProcessInfo.processInfo
    return processInfo.processorCount
}

View File

@ -1,6 +1,5 @@
import SwiftUI
import AVFoundation
import Foundation
struct ContentView: View {
@StateObject var whisperState = WhisperState()
@ -30,125 +29,15 @@ struct ContentView: View {
Text(verbatim: whisperState.messageLog)
.frame(maxWidth: .infinity, alignment: .leading)
}
.font(.footnote)
.padding()
.background(Color.gray.opacity(0.1))
.cornerRadius(10)
HStack {
Button("Clear Logs", action: {
whisperState.messageLog = ""
})
.font(.footnote)
.buttonStyle(.bordered)
Button("Copy Logs", action: {
UIPasteboard.general.string = whisperState.messageLog
})
.font(.footnote)
.buttonStyle(.bordered)
Button("Bench", action: {
Task {
await whisperState.benchCurrentModel()
}
})
.font(.footnote)
.buttonStyle(.bordered)
.disabled(!whisperState.canTranscribe)
Button("Bench All", action: {
Task {
await whisperState.bench(models: ModelsView.getDownloadedModels())
}
})
.font(.footnote)
.buttonStyle(.bordered)
.disabled(!whisperState.canTranscribe)
}
NavigationLink(destination: ModelsView(whisperState: whisperState)) {
Text("View Models")
}
.font(.footnote)
.padding()
}
.navigationTitle("Whisper SwiftUI Demo")
.padding()
}
}
/// Lists every known Whisper model, each rendered as a `DownloadButton` that can
/// download the model or load it into the shared `WhisperState`.
struct ModelsView: View {
    @ObservedObject var whisperState: WhisperState
    @Environment(\.dismiss) var dismiss

    /// Catalogue of models offered for download (name, size note, remote URL, local file name).
    private static let models: [Model] = [
        Model(name: "tiny", info: "(F16, 75 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.bin", filename: "tiny.bin"),
        Model(name: "tiny-q5_1", info: "(31 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q5_1.bin", filename: "tiny-q5_1.bin"),
        Model(name: "tiny-q8_0", info: "(42 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny-q8_0.bin", filename: "tiny-q8_0.bin"),
        Model(name: "tiny.en", info: "(F16, 75 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin", filename: "tiny.en.bin"),
        Model(name: "tiny.en-q5_1", info: "(31 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q5_1.bin", filename: "tiny.en-q5_1.bin"),
        Model(name: "tiny.en-q8_0", info: "(42 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en-q8_0.bin", filename: "tiny.en-q8_0.bin"),
        Model(name: "base", info: "(F16, 142 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin", filename: "base.bin"),
        Model(name: "base-q5_1", info: "(57 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q5_1.bin", filename: "base-q5_1.bin"),
        Model(name: "base-q8_0", info: "(78 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base-q8_0.bin", filename: "base-q8_0.bin"),
        Model(name: "base.en", info: "(F16, 142 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin", filename: "base.en.bin"),
        Model(name: "base.en-q5_1", info: "(57 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q5_1.bin", filename: "base.en-q5_1.bin"),
        Model(name: "base.en-q8_0", info: "(78 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en-q8_0.bin", filename: "base.en-q8_0.bin"),
        Model(name: "small", info: "(F16, 466 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.bin", filename: "small.bin"),
        Model(name: "small-q5_1", info: "(181 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q5_1.bin", filename: "small-q5_1.bin"),
        Model(name: "small-q8_0", info: "(252 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small-q8_0.bin", filename: "small-q8_0.bin"),
        Model(name: "small.en", info: "(F16, 466 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin", filename: "small.en.bin"),
        Model(name: "small.en-q5_1", info: "(181 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q5_1.bin", filename: "small.en-q5_1.bin"),
        Model(name: "small.en-q8_0", info: "(252 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en-q8_0.bin", filename: "small.en-q8_0.bin"),
        Model(name: "medium", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.bin", filename: "medium.bin"),
        Model(name: "medium-q5_0", info: "(514 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q5_0.bin", filename: "medium-q5_0.bin"),
        Model(name: "medium-q8_0", info: "(785 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium-q8_0.bin", filename: "medium-q8_0.bin"),
        Model(name: "medium.en", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin", filename: "medium.en.bin"),
        Model(name: "medium.en-q5_0", info: "(514 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q5_0.bin", filename: "medium.en-q5_0.bin"),
        Model(name: "medium.en-q8_0", info: "(785 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en-q8_0.bin", filename: "medium.en-q8_0.bin"),
        Model(name: "large-v1", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large.bin", filename: "large.bin"),
        Model(name: "large-v2", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2.bin", filename: "large-v2.bin"),
        Model(name: "large-v2-q5_0", info: "(1.1 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q5_0.bin", filename: "large-v2-q5_0.bin"),
        Model(name: "large-v2-q8_0", info: "(1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v2-q8_0.bin", filename: "large-v2-q8_0.bin"),
        Model(name: "large-v3", info: "(F16, 2.9 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin", filename: "large-v3.bin"),
        Model(name: "large-v3-q5_0", info: "(1.1 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-q5_0.bin", filename: "large-v3-q5_0.bin"),
        Model(name: "large-v3-turbo", info: "(F16, 1.5 GiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo.bin", filename: "large-v3-turbo.bin"),
        Model(name: "large-v3-turbo-q5_0", info: "(547 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q5_0.bin", filename: "large-v3-turbo-q5_0.bin"),
        Model(name: "large-v3-turbo-q8_0", info: "(834 MiB)", url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3-turbo-q8_0.bin", filename: "large-v3-turbo-q8_0.bin"),
    ]

    /// Returns the catalogue entries whose model file is present on disk.
    static func getDownloadedModels() -> [Model] {
        // Delegate to Model.fileExists() instead of re-deriving the path here:
        // `URL.path()` percent-encodes by default, which is not a valid input for
        // FileManager when the path contains characters that need escaping, and it
        // could disagree with the check Model itself performs via `.path`.
        return models.filter { $0.fileExists() }
    }

    /// Dismisses this screen and asks `whisperState` to load the given model's file.
    func loadModel(model: Model) {
        Task {
            dismiss()
            whisperState.loadModel(path: model.fileURL)
        }
    }

    var body: some View {
        List {
            Section(header: Text("Models")) {
                ForEach(ModelsView.models) { model in
                    DownloadButton(model: model)
                        .onLoad(perform: loadModel)
                }
            }
        }
        .listStyle(GroupedListStyle())
        .navigationBarTitle("Models", displayMode: .inline).toolbar {}
    }
}
}
//struct ContentView_Previews: PreviewProvider {
// static var previews: some View {
// ContentView()
// }
//}
// Preview provider that renders a default-constructed ContentView.
struct ContentView_Previews: PreviewProvider {
static var previews: some View {
ContentView()
}
}

View File

@ -1,102 +0,0 @@
import SwiftUI
/// A button for one model that downloads it, cancels a running download, loads it once
/// present, and offers a swipe action to delete the downloaded file.
///
/// `status` is one of "download", "downloading" or "downloaded"; it selects both the
/// label and what a tap does.
struct DownloadButton: View {
    private var model: Model

    @State private var status: String

    @State private var downloadTask: URLSessionDownloadTask?
    @State private var progress = 0.0
    @State private var observation: NSKeyValueObservation?

    /// Callback invoked when the user taps a model that is already on disk.
    private var onLoad: ((_ model: Model) -> Void)?

    init(model: Model) {
        self.model = model
        status = model.fileExists() ? "downloaded" : "download"
    }

    /// Returns a copy of this button that calls `action` when a downloaded model is tapped.
    func onLoad(perform action: @escaping (_ model: Model) -> Void) -> DownloadButton {
        var button = self
        button.onLoad = action
        return button
    }

    /// Updates `status` on the main queue; URLSession callbacks arrive on a background queue
    /// and view state must not be mutated from there.
    private func updateStatus(_ newStatus: String) {
        DispatchQueue.main.async {
            status = newStatus
        }
    }

    /// Starts downloading the model file and tracks progress.
    /// Every failure path resets `status` to "download" so the row never stays stuck
    /// on "downloading".
    private func download() {
        status = "downloading"
        print("Downloading model \(model.name) from \(model.url)")
        guard let url = URL(string: model.url) else {
            status = "download"
            return
        }

        downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in
            if let error = error {
                // A cancellation is initiated by this view, which already updated `status`;
                // touching it here could clobber a download restarted in the meantime.
                if (error as? URLError)?.code == .cancelled {
                    return
                }
                print("Error: \(error.localizedDescription)")
                updateStatus("download")
                return
            }

            guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else {
                print("Server error!")
                updateStatus("download")
                return
            }

            guard let temporaryURL = temporaryURL else {
                updateStatus("download")
                return
            }

            do {
                // The temporary file is only valid inside this handler, so copy it synchronously.
                try FileManager.default.copyItem(at: temporaryURL, to: model.fileURL)
                print("Writing to \(model.filename) completed")
                updateStatus("downloaded")
            } catch let err {
                print("Error: \(err.localizedDescription)")
                updateStatus("download")
            }
        }

        observation = downloadTask?.progress.observe(\.fractionCompleted) { taskProgress, _ in
            let fraction = taskProgress.fractionCompleted
            DispatchQueue.main.async {
                self.progress = fraction
            }
        }

        downloadTask?.resume()
    }

    var body: some View {
        VStack {
            Button(action: {
                if (status == "download") {
                    download()
                } else if (status == "downloading") {
                    downloadTask?.cancel()
                    status = "download"
                } else if (status == "downloaded") {
                    if model.fileExists() {
                        onLoad?(model)
                    } else {
                        // The file vanished: fetch it again, and do not hand a missing
                        // file to the load callback.
                        download()
                    }
                }
            }) {
                let title = "\(model.name) \(model.info)"
                if (status == "download") {
                    Text("Download \(title)")
                } else if (status == "downloading") {
                    Text("\(title) (Downloading \(Int(progress * 100))%)")
                } else if (status == "downloaded") {
                    Text("Load \(title)")
                } else {
                    Text("Unknown status")
                }
            }.swipeActions {
                if (status == "downloaded") {
                    Button("Delete") {
                        do {
                            try FileManager.default.removeItem(at: model.fileURL)
                        } catch {
                            print("Error deleting file: \(error)")
                        }
                        // Reflect what is actually on disk: a failed delete leaves the file in place.
                        status = model.fileExists() ? "downloaded" : "download"
                    }
                    .tint(.red)
                }
            }
        }
        .onDisappear() {
            downloadTask?.cancel()
            // The cancelled task will not report back, so clear the in-progress state here.
            if (status == "downloading") {
                status = "download"
            }
        }
    }
}

View File

@ -17,8 +17,6 @@
0AAC5D9F29539CD0003032C3 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 0AAC5D9E29539CD0003032C3 /* Assets.xcassets */; };
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DCD2953A05C003032C3 /* WhisperState.swift */; };
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0AAC5DD02953A394003032C3 /* LibWhisper.swift */; };
7F79E0EE2CE0A78000ACD7BF /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */; };
7F79E0F02CE0C6F700ACD7BF /* Model.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7F79E0EF2CE0C6F700ACD7BF /* Model.swift */; };
E3F92DC52AFA8E3800A6A9D4 /* whisper in Frameworks */ = {isa = PBXBuildFile; productRef = E3F92DC42AFA8E3800A6A9D4 /* whisper */; };
/* End PBXBuildFile section */
@ -35,8 +33,6 @@
0AAC5DA029539CD0003032C3 /* WhisperCppDemo.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = WhisperCppDemo.entitlements; sourceTree = "<group>"; };
0AAC5DCD2953A05C003032C3 /* WhisperState.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = WhisperState.swift; sourceTree = "<group>"; };
0AAC5DD02953A394003032C3 /* LibWhisper.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LibWhisper.swift; sourceTree = "<group>"; };
7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = "<group>"; };
7F79E0EF2CE0C6F700ACD7BF /* Model.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Model.swift; sourceTree = "<group>"; };
E3F92DC22AFA8DD800A6A9D4 /* whisper.cpp */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = whisper.cpp; path = ../..; sourceTree = "<group>"; };
/* End PBXFileReference section */
@ -56,7 +52,6 @@
isa = PBXGroup;
children = (
0AAC5DCD2953A05C003032C3 /* WhisperState.swift */,
7F79E0EF2CE0C6F700ACD7BF /* Model.swift */,
);
path = Models;
sourceTree = "<group>";
@ -124,7 +119,6 @@
isa = PBXGroup;
children = (
0AAC5D9C29539CCF003032C3 /* ContentView.swift */,
7F79E0ED2CE0A78000ACD7BF /* DownloadButton.swift */,
);
path = UI;
sourceTree = "<group>";
@ -226,9 +220,7 @@
0AAC5DCE2953A05C003032C3 /* WhisperState.swift in Sources */,
0AAC5DD12953A394003032C3 /* LibWhisper.swift in Sources */,
0AA7514C2953B569001EE061 /* RiffWaveUtils.swift in Sources */,
7F79E0EE2CE0A78000ACD7BF /* DownloadButton.swift in Sources */,
0AA7514E2953D958001EE061 /* Recorder.swift in Sources */,
7F79E0F02CE0C6F700ACD7BF /* Model.swift in Sources */,
);
runOnlyForDeploymentPostprocessing = 0;
};
@ -378,9 +370,7 @@
PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator";
SUPPORTS_MACCATALYST = NO;
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = YES;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_OPTIMIZATION_LEVEL = "-Onone";
SWIFT_VERSION = 5.0;
@ -425,9 +415,7 @@
PRODUCT_BUNDLE_IDENTIFIER = com.whispercppdemo.WhisperCppDemo;
PRODUCT_NAME = "$(TARGET_NAME)";
SDKROOT = auto;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator";
SUPPORTS_MACCATALYST = NO;
SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = YES;
SUPPORTED_PLATFORMS = "iphoneos iphonesimulator macosx";
SWIFT_EMIT_LOC_STRINGS = YES;
SWIFT_VERSION = 5.0;
TARGETED_DEVICE_FAMILY = "1,2";

View File

@ -33,7 +33,6 @@ else()
endif()
option(BUILD_SHARED_LIBS "ggml: build shared libraries" ${BUILD_SHARED_LIBS_DEFAULT})
option(GGML_BACKEND_DL "ggml: build backends as dynamic libraries (requires BUILD_SHARED_LIBS)" OFF)
#
# option list
@ -92,38 +91,28 @@ else()
set(INS_ENB ON)
endif()
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX_VNNI "ggml: enable AVX-VNNI" OFF)
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512F" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
if (NOT MSVC)
# in MSVC F16C and FMA is implied with AVX2/AVX512
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
option(GGML_F16C "ggml: enable F16C" ${INS_ENB})
# MSVC does not seem to support AMX
option(GGML_AMX_TILE "ggml: enable AMX-TILE" OFF)
option(GGML_AMX_INT8 "ggml: enable AMX-INT8" OFF)
option(GGML_AMX_BF16 "ggml: enable AMX-BF16" OFF)
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_RVV "ggml: enable rvv" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
option(GGML_AVX "ggml: enable AVX" ${INS_ENB})
option(GGML_AVX2 "ggml: enable AVX2" ${INS_ENB})
option(GGML_AVX512 "ggml: enable AVX512" OFF)
option(GGML_AVX512_VBMI "ggml: enable AVX512-VBMI" OFF)
option(GGML_AVX512_VNNI "ggml: enable AVX512-VNNI" OFF)
option(GGML_AVX512_BF16 "ggml: enable AVX512-BF16" OFF)
option(GGML_FMA "ggml: enable FMA" ${INS_ENB})
if (NOT MSVC)
option(GGML_F16C "ggml: enable F16C" ${INS_ENB}) # in MSVC F16C is implied with AVX2/AVX512
endif()
option(GGML_LASX "ggml: enable lasx" ON)
option(GGML_LSX "ggml: enable lsx" ON)
option(GGML_SVE "ggml: enable SVE" OFF)
if (WIN32)
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows version")
set(GGML_WIN_VER "0x602" CACHE STRING "ggml: Windows Version")
endif()
# ggml core
set(GGML_SCHED_MAX_COPIES "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
option(GGML_CPU "ggml: enable CPU backend" ON)
# 3rd party libs / backends
option(GGML_ACCELERATE "ggml: enable Accelerate framework" ON)
@ -134,9 +123,14 @@ option(GGML_LLAMAFILE "ggml: use LLAMAFILE"
option(GGML_CUDA "ggml: use CUDA" OFF)
option(GGML_MUSA "ggml: use MUSA" OFF)
option(GGML_CUDA_FORCE_DMMV "ggml: use dmmv instead of mmvq CUDA kernels" OFF)
option(GGML_CUDA_FORCE_MMQ "ggml: use mmq kernels instead of cuBLAS" OFF)
option(GGML_CUDA_FORCE_CUBLAS "ggml: always use cuBLAS instead of mmq kernels" OFF)
set (GGML_CUDA_DMMV_X "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
set (GGML_CUDA_MMV_Y "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
option(GGML_CUDA_F16 "ggml: use 16 bit floats for some calculations" OFF)
set (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
"ggml: iters./thread per block for Q2_K/Q6_K")
set (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"ggml: max. batch size for using peer access")
option(GGML_CUDA_NO_PEER_COPY "ggml: do not use peer to peer copies" OFF)
@ -144,7 +138,7 @@ option(GGML_CUDA_NO_VMM "ggml: do not try to use CUDA VMM"
option(GGML_CUDA_FA_ALL_QUANTS "ggml: compile all quants for FlashAttention" OFF)
option(GGML_CUDA_GRAPHS "ggml: use CUDA graphs (llama.cpp only)" ${GGML_CUDA_GRAPHS_DEFAULT})
option(GGML_HIP "ggml: use HIP" OFF)
option(GGML_HIPBLAS "ggml: use hipBLAS" OFF)
option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
@ -156,7 +150,6 @@ option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation"
option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF)
option(GGML_KOMPUTE "ggml: use Kompute" OFF)
option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT})
option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF)
option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF)
option(GGML_METAL_SHADER_DEBUG "ggml: compile Metal with -fno-fast-math" OFF)
option(GGML_METAL_EMBED_LIBRARY "ggml: embed Metal library" ${GGML_METAL})
@ -169,8 +162,6 @@ option(GGML_SYCL "ggml: use SYCL"
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
"ggml: sycl target device")
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
"ggml: sycl device architecture")
# extra artifacts
option(GGML_BUILD_TESTS "ggml: build tests" ${GGML_STANDALONE})
@ -183,7 +174,11 @@ option(GGML_BUILD_EXAMPLES "ggml: build examples" ${GGML_STANDALONE})
set(CMAKE_C_STANDARD 11)
set(CMAKE_C_STANDARD_REQUIRED true)
set(CMAKE_CXX_STANDARD 17)
if (GGML_SYCL)
set(CMAKE_CXX_STANDARD 17)
else()
set(CMAKE_CXX_STANDARD 11)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED true)
set(THREADS_PREFER_PTHREAD_FLAG ON)
@ -219,14 +214,13 @@ include(CMakePackageConfigHelpers)
# all public headers
set(GGML_PUBLIC_HEADERS
include/ggml.h
include/ggml-cpu.h
include/ggml-alloc.h
include/ggml-backend.h
include/ggml-blas.h
include/ggml-cann.h
include/ggml-cuda.h
include/ggml.h
include/ggml-kompute.h
include/ggml-opt.h
include/ggml-metal.h
include/ggml-rpc.h
include/ggml-sycl.h
@ -236,14 +230,15 @@ set_target_properties(ggml PROPERTIES PUBLIC_HEADER "${GGML_PUBLIC_HEADERS}")
#if (GGML_METAL)
# set_target_properties(ggml PROPERTIES RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/src/ggml-metal.metal")
#endif()
install(TARGETS ggml LIBRARY PUBLIC_HEADER)
install(TARGETS ggml-base LIBRARY)
install(TARGETS ggml PUBLIC_HEADER)
if (BUILD_SHARED_LIBS)
install(TARGETS ggml LIBRARY)
endif()
# FIXME: this should be done in the backend cmake files
if (GGML_METAL)
# FIXME: does this need to be installed with GGML_METAL_EMBED_LIBRARY?
install(
FILES src/ggml-metal/ggml-metal.metal
FILES src/ggml-metal.metal
PERMISSIONS
OWNER_READ
OWNER_WRITE

View File

@ -0,0 +1,220 @@
#!/usr/bin/env python
import logging
import argparse
import asyncio
import os
from tempfile import gettempdir
# Module-level configuration and shared state for the shader generator.
logger = logging.getLogger("ggml-vk-generate-shaders")
# Executable name of the GLSL compiler; used as the first element of every compile command.
GLSLC = "glslc"
# Data type suffixes; one set of shader variants is generated per entry.
type_names = [
"f32",
"f16",
"q4_0",
"q4_1",
"q5_0",
"q5_1",
"q8_0",
"q2_k",
"q3_k",
"q4_k",
"q5_k",
"q6_k",
]
# NOTE(review): presumably the cap on concurrently running compile tasks — its use is outside this view; confirm.
ASYNCIO_CONCURRENCY = 64
# Directory that shader source file names are resolved against.
input_dir = "vulkan-shaders"
# Directory the compiled .spv files are written to (the system temp directory).
output_dir = gettempdir()
# Serializes appends to shader_fnames across concurrently running compile coroutines.
lock = asyncio.Lock()
# Collected (shader name, output .spv path) tuples for every successful compile.
shader_fnames = []
async def string_to_spv(name, in_fname, defines, fp16=True):
    """Compile one shader variant to SPIR-V with glslc and record the result.

    Args:
        name: Base name of the variant; "_fp32" is appended when ``fp16`` is false.
        in_fname: Shader source file name, resolved relative to ``input_dir``.
        defines: Mapping of preprocessor names to values, passed as ``-Dkey=value``.
        fp16: Selects the fp16 variant (default) or the "_fp32"-suffixed variant.

    Returns:
        None. On success ``(name, out_fname)`` is appended to ``shader_fnames``;
        on a non-zero compiler exit status the failure is logged and nothing is recorded.
    """
    name = f"{name}{'_fp32' if not fp16 else ''}"
    out_fname = os.path.join(output_dir, f"{name}.spv")
    in_path = os.path.join(input_dir, in_fname)

    cmd = [GLSLC, "-fshader-stage=compute", "--target-env=vulkan1.2", "-O", in_path, "-o", out_fname]
    cmd.extend([f"-D{key}={value}" for key, value in defines.items()])

    # stdout is still piped so compiler chatter does not reach the terminal,
    # but its content is not needed and therefore not decoded.
    proc = await asyncio.create_subprocess_exec(*cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
    _stdout, stderr = await proc.communicate()

    if proc.returncode:
        # Decode leniently: a stray non-UTF-8 byte in the diagnostics must not raise
        # UnicodeDecodeError and hide the actual compile error.
        error = stderr.decode(errors="replace")
        cmd = " ".join(cmd)
        logger.error(f"cannot compile {name}\n\n{cmd}\n\n{error}")
        return

    # Many compile coroutines run concurrently; guard the shared result list.
    async with lock:
        shader_fnames.append((name, out_fname))
def matmul_shaders(tasks, fp16, matmul_id):
    """Queue the compile coroutines for every matrix-multiplication shader variant.

    Args:
        tasks: List that receives one ``string_to_spv`` coroutine per variant.
        fp16: When true, use the float16 settings (wider loads, ``FLOAT16`` define).
        matmul_id: When true, build the "matmul_id" variants with ``MUL_MAT_ID`` defined.
    """
    # Vector width and aligned B-matrix types depend only on the precision mode.
    load_vec, aligned_b_type_f32, aligned_b_type_f16 = (
        ("8", "mat2x4", "f16mat2x4") if fp16 else ("4", "vec4", "f16vec4")
    )

    shader_name = "matmul_id" if matmul_id else "matmul"

    # Defines shared by all variants; insertion order is kept as FLOAT_TYPE, MUL_MAT_ID, FLOAT16.
    base_dict = {"FLOAT_TYPE": "float16_t" if fp16 else "float"}
    if matmul_id:
        base_dict["MUL_MAT_ID"] = "1"
    if fp16:
        base_dict["FLOAT16"] = "1"

    def queue(variant, extra_defines):
        # Every matmul variant is built from the same source file.
        tasks.append(string_to_spv(variant, "mul_mm.comp", base_dict | extra_defines, fp16))

    # Shaders with f16 B_TYPE
    for suffix, a_define in (("f32_f16", "DATA_A_F32"), ("f16", "DATA_A_F16")):
        queue(f"{shader_name}_{suffix}", {a_define: "1", "B_TYPE": "float16_t", "D_TYPE": "float"})
        queue(
            f"{shader_name}_{suffix}_aligned",
            {a_define: "1", "LOAD_VEC_A": load_vec, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f16, "D_TYPE": "float"},
        )

    # Shaders with f32 B_TYPE, one pair per A data type
    for tname in type_names:
        data_a_key = f"DATA_A_{tname.upper()}"
        # Only the float A types use the full load width; every other type loads 2 at a time.
        load_vec_a = load_vec if tname in ("f32", "f16") else "2"
        queue(f"{shader_name}_{tname}_f32", {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"})
        queue(
            f"{shader_name}_{tname}_f32_aligned",
            {data_a_key: "2", "LOAD_VEC_A": load_vec_a, "LOAD_VEC_B": load_vec, "B_TYPE": aligned_b_type_f32, "D_TYPE": "float"},
        )
async def main():
    """Compile all Vulkan compute shaders to SPIR-V and write ``ggml-vulkan-shaders.hpp``.

    Steps:
      1. Collect one ``string_to_spv(...)`` awaitable per shader variant.
      2. Await them all, with at most ``ASYNCIO_CONCURRENCY`` running at once.
      3. For every ``(name, path)`` recorded in ``shader_fnames``, embed the file's
         bytes as ``unsigned char <name>_data[]`` plus ``<name>_len`` in the header,
         then delete the intermediate file.
    """
    logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")

    tasks = []

    for fp16 in (False, True):
        # MUL_MAT
        matmul_shaders(tasks, fp16, False)
        # MUL_MAT_ID
        matmul_shaders(tasks, fp16, True)

    # Bound once, before the loop: it is loop-invariant and is also read by the
    # norm / soft_max / sum_rows entries after the loop, so it must not depend on
    # the loop having executed at least one iteration.
    base_dict = {"FLOAT_TYPE": "float"}

    for tname in type_names:
        # mul mat vec
        data_a_key = f"DATA_A_{tname.upper()}"
        shader = f"mul_mat_vec_{tname}.comp" if tname.endswith("_k") else "mul_mat_vec.comp"

        tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f32_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))
        tasks.append(string_to_spv(f"mul_mat_vec_{tname}_f16_f32", shader, base_dict | {data_a_key: "1", "B_TYPE": "float16_t", "D_TYPE": "float"}))

        tasks.append(string_to_spv(f"mul_mat_vec_id_{tname}_f32", shader, base_dict | {"MUL_MAT_ID": "1", data_a_key: "1", "B_TYPE": "float", "D_TYPE": "float"}))

        # Dequant shaders
        if tname != "f16":
            tasks.append(string_to_spv(f"dequant_{tname}", f"dequant_{tname}.comp", base_dict | {data_a_key: "1", "D_TYPE": "float16_t"}))

        # get_rows
        if not tname.endswith("_k"):
            shader = "get_rows.comp" if tname in ("f32", "f16") else "get_rows_quant.comp"

            if tname == "f16":
                tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))
            else:
                tasks.append(string_to_spv(f"get_rows_{tname}", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float16_t"}))
            tasks.append(string_to_spv(f"get_rows_{tname}_f32", shader, {data_a_key: "1", "B_TYPE": "int", "D_TYPE": "float"}))

    tasks.append(string_to_spv("mul_mat_vec_p021_f16_f32", "mul_mat_vec_p021.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("mul_mat_vec_nc_f16_f32", "mul_mat_vec_nc.comp", {"A_TYPE": "float16_t", "B_TYPE": "float", "D_TYPE": "float"}))

    # Norms
    tasks.append(string_to_spv("norm_f32", "norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("rms_norm_f32", "rms_norm.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))

    tasks.append(string_to_spv("cpy_f32_f32", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("cpy_f32_f16", "copy.comp", {"A_TYPE": "float", "D_TYPE": "float16_t"}))
    tasks.append(string_to_spv("cpy_f16_f16", "copy.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t", "OPTIMIZATION_ERROR_WORKAROUND": "1"}))

    tasks.append(string_to_spv("add_f32", "add.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

    tasks.append(string_to_spv("split_k_reduce", "mul_mat_split_k_reduce.comp", {}))

    tasks.append(string_to_spv("mul_f32", "mul.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

    tasks.append(string_to_spv("div_f32", "div.comp", {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

    tasks.append(string_to_spv("scale_f32", "scale.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

    tasks.append(string_to_spv("sqr_f32", "square.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

    tasks.append(string_to_spv("clamp_f32", "clamp.comp", {"A_TYPE": "float", "D_TYPE": "float", "FLOAT_TYPE": "float"}))

    tasks.append(string_to_spv("gelu_f32", "gelu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("silu_f32", "silu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("relu_f32", "relu.comp", {"A_TYPE": "float", "D_TYPE": "float"}))

    tasks.append(string_to_spv("diag_mask_inf_f32", "diag_mask_inf.comp", {"A_TYPE": "float", "D_TYPE": "float"}))

    tasks.append(string_to_spv("soft_max_f32", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("soft_max_f32_f16", "soft_max.comp", base_dict | {"A_TYPE": "float", "B_TYPE": "float16_t", "D_TYPE": "float"}))

    tasks.append(string_to_spv("rope_norm_f32", "rope_norm.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("rope_norm_f16", "rope_norm.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))

    tasks.append(string_to_spv("rope_neox_f32", "rope_neox.comp", {"A_TYPE": "float", "D_TYPE": "float"}))
    tasks.append(string_to_spv("rope_neox_f16", "rope_neox.comp", {"A_TYPE": "float16_t", "D_TYPE": "float16_t"}))

    tasks.append(string_to_spv("argsort_f32", "argsort.comp", {"A_TYPE": "float"}))

    tasks.append(string_to_spv("sum_rows_f32", "sum_rows.comp", base_dict | {"A_TYPE": "float", "D_TYPE": "float"}))

    # Helper to decorate tasks with semaphore acquisition.
    async def withSemaphore(sem, task):
        async with sem:
            return await task

    # Run tasks concurrently guarded by a concurrency limit.
    sem = asyncio.Semaphore(ASYNCIO_CONCURRENCY)
    await asyncio.gather(*(withSemaphore(sem, task) for task in tasks))

    bytes_per_row = 12  # hex literals emitted per line of the generated array

    with open("ggml-vulkan-shaders.hpp", "w") as f:
        f.write("#include <cstdint>\n\n")
        # Sorted so the generated header has a stable order regardless of which
        # compile job finished first.
        for name, path in sorted(shader_fnames):
            # Read the whole file and close it before writing / deleting it.
            with open(path, "rb") as spv:
                blob = spv.read()

            f.write(f"unsigned char {name}_data[] = {{\n")
            for start in range(0, len(blob), bytes_per_row):
                row = blob[start:start + bytes_per_row]
                f.write("".join(f"0x{val:02x}," for val in row))
                # A line break follows every completed row only (a trailing
                # partial row gets none), matching the historical layout.
                if len(row) == bytes_per_row:
                    f.write("\n")
            f.write("\n};\n")
            f.write(f"const uint64_t {name}_len = {len(blob)};\n\n")
            # The intermediate file is no longer needed once embedded.
            os.remove(path)
if __name__ == "__main__":
    # Script entry point: read the command line, set up logging, run the generator.
    cli = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
    cli.add_argument("--glslc", help="Path to glslc")
    cli.add_argument("--verbose", action="store_true", help="increase output verbosity")
    args = cli.parse_args()

    # --verbose switches the root logger from INFO to DEBUG.
    log_level = logging.DEBUG if args.verbose else logging.INFO
    logging.basicConfig(level=log_level)

    # Only override the module-level compiler path when one was given explicitly.
    if args.glslc:
        GLSLC = args.glslc

    asyncio.run(main())

View File

@ -3,20 +3,6 @@
#include "ggml.h"
#include "ggml-alloc.h"
#ifdef GGML_BACKEND_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BACKEND_BUILD
# define GGML_BACKEND_API __declspec(dllexport) extern
# else
# define GGML_BACKEND_API __declspec(dllimport) extern
# endif
# else
# define GGML_BACKEND_API __attribute__ ((visibility ("default"))) extern
# endif
#else
# define GGML_BACKEND_API extern
#endif
#ifdef __cplusplus
extern "C" {
#endif
@ -86,7 +72,7 @@ extern "C" {
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
// "offset" refers to the offset in tensor->data for setting/getting data
// "offset" refers to the offset of the tensor data for setting/getting data
GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_memset( struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size);
@ -128,12 +114,11 @@ extern "C" {
//
enum ggml_backend_dev_type {
// CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU,
// GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
// accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
GGML_BACKEND_DEVICE_TYPE_ACCEL
// devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
GGML_BACKEND_DEVICE_TYPE_GPU_FULL
};
// functionality supported by the device
@ -142,8 +127,6 @@ extern "C" {
bool async;
// pinned host buffer
bool host_buffer;
// creating buffers from host ptr
bool buffer_from_host_ptr;
// event synchronization
bool events;
};
@ -182,22 +165,9 @@ extern "C" {
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
// Common functions that may be obtained using ggml_backend_reg_get_proc_address
// Split buffer type for tensor parallelism
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
// Set the number of threads for the backend
typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
// Get additional buffer types provided by the device (returns a NULL-terminated array)
typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
// Set the abort callback for the backend
typedef void (*ggml_backend_set_abort_callback_t)(ggml_backend_t backend, ggml_abort_callback abort_callback, void * abort_callback_data);
// Get a list of feature flags supported by the backend (returns a NULL-terminated array)
struct ggml_backend_feature {
const char * name;
const char * value;
};
typedef struct ggml_backend_feature * (*ggml_backend_get_features_t)(ggml_backend_reg_t reg);
// Functions that may be obtained using ggml_backend_reg_get_proc_address
typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
//
// Backend registry
@ -219,16 +189,9 @@ extern "C" {
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
// = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
// Load a backend from a dynamic library and register it
GGML_API ggml_backend_reg_t ggml_backend_load(const char * path);
// Unload a backend if loaded dynamically and unregister it
GGML_API void ggml_backend_unload(ggml_backend_reg_t reg);
// Load all known backends from dynamic libraries
GGML_API void ggml_backend_load_all(void);
//
// Backend scheduler
//
@ -257,20 +220,14 @@ extern "C" {
ggml_backend_sched_reserve(sched, reserve_graph);
// compute
graph = build_graph(sched); // the graph and its tensors are single-use in terms of allocation, multi-use in terms of computation
for (int i = 0; i < 10; ++i) {
ggml_backend_sched_graph_compute(sched, graph); // on the first iteration the graph is allocated automatically
}
graph = build_graph(sched);
ggml_backend_sched_graph_compute(sched, graph);
// if there are graph inputs:
graph = build_graph(sched); // get a new graph that is not allocated (the metadata for the old graph is freed once ggml_free is called)
ggml_backend_sched_reset(sched); // clear the allocation of the previous graph
ggml_backend_sched_alloc_graph(sched, graph); // explicitly allocate the new graph but do not execute it
ggml_backend_tensor_set(input_tensor, ...); // copy data to the newly allocated graph tensors
ggml_backend_sched_graph_compute(sched, graph); // execute the graph
// as an alternative to the above it is also possible to assign the inputs to a dedicated context and
// allocate them statically via ggml_backend_alloc_ctx_tensors
ggml_backend_sched_reset(sched);
ggml_backend_sched_alloc_graph(sched, graph);
ggml_backend_tensor_set(input_tensor, ...);
ggml_backend_sched_graph_compute(sched, graph);
}
*/
@ -285,7 +242,7 @@ extern "C" {
//
typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);
// Initialize a backend scheduler, backends with low index are given priority over backends with high index
// Initialize a backend scheduler
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
@ -310,9 +267,7 @@ extern "C" {
GGML_API enum ggml_status ggml_backend_sched_graph_compute_async(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
GGML_API void ggml_backend_sched_synchronize(ggml_backend_sched_t sched);
// Reset all assignments and allocators - must be called before changing the node backends or allocating a new graph.
// This in effect deallocates all tensors that were previously allocated and leaves them with dangling pointers.
// The correct way to use this API is to discard the deallocated tensors and create new ones.
// Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
@ -342,10 +297,27 @@ extern "C" {
GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr);
GGML_API void ggml_backend_view_init(struct ggml_tensor * tensor);
// CPU buffer types are always available
//
// CPU backend
//
GGML_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
// Create a backend buffer from an existing pointer
GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size);
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
GGML_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
#ifdef __cplusplus
}
#endif

View File

@ -9,15 +9,13 @@ extern "C" {
#endif
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_blas_init(void);
GGML_API ggml_backend_t ggml_backend_blas_init(void);
GGML_BACKEND_API bool ggml_backend_is_blas(ggml_backend_t backend);
GGML_API bool ggml_backend_is_blas(ggml_backend_t backend);
// number of threads used for conversion to float
// for openblas and blis, this will also set the number of threads used for blas operations
GGML_BACKEND_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_blas_reg(void);
GGML_API void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
#ifdef __cplusplus

View File

@ -34,8 +34,6 @@ extern "C" {
*/
#define GGML_CANN_MAX_DEVICES 16
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
/**
* @brief Initializes the CANN backend for a specified device.
*
@ -46,7 +44,7 @@ GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cann_reg(void);
* @param device The index of the device to initialize.
* @return A pointer to the initialized backend instance, or nullptr on failure.
*/
GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
GGML_API ggml_backend_t ggml_backend_cann_init(int32_t device);
/**
* @brief Checks if a given backend is a CANN backend.
@ -57,7 +55,7 @@ GGML_BACKEND_API ggml_backend_t ggml_backend_cann_init(int32_t device);
* @param backend The backend instance to check.
* @return True if the backend is a CANN backend, false otherwise.
*/
GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
GGML_API bool ggml_backend_is_cann(ggml_backend_t backend);
/**
* @brief Retrieves the CANN buffer type for a specified device.
@ -69,7 +67,7 @@ GGML_BACKEND_API bool ggml_backend_is_cann(ggml_backend_t backend);
* @return A pointer to the buffer type interface for the specified device, or
* nullptr if the device index is out of range.
*/
GGML_BACKEND_API ggml_backend_buffer_type_t
GGML_API ggml_backend_buffer_type_t
ggml_backend_cann_buffer_type(int32_t device);
/**
@ -80,14 +78,14 @@ ggml_backend_cann_buffer_type(int32_t device);
*
* @return The number of CANN devices available.
*/
GGML_BACKEND_API int32_t ggml_backend_cann_get_device_count(void);
GGML_API int32_t ggml_backend_cann_get_device_count(void);
/**
* @brief pinned host buffer for use with the CPU backend for faster copies between CPU and NPU.
*
* @return A pointer to the host buffer type interface.
*/
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
/**
* @brief Retrieves the description of a specific CANN device.
@ -99,7 +97,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(v
* @param description Pointer to a buffer where the description will be written.
* @param description_size Size of the description buffer.
*/
GGML_BACKEND_API void ggml_backend_cann_get_device_description(
GGML_API void ggml_backend_cann_get_device_description(
int32_t device, char* description, size_t description_size);
/**
@ -114,7 +112,7 @@ GGML_BACKEND_API void ggml_backend_cann_get_device_description(
* @param total Pointer to a variable where the total memory size will be
* stored.
*/
GGML_BACKEND_API void ggml_backend_cann_get_device_memory(int32_t device,
GGML_API void ggml_backend_cann_get_device_memory(int32_t device,
size_t* free,
size_t* total);

View File

@ -1,38 +0,0 @@
#pragma once
#ifndef __cplusplus
#error "This header is for C++ only"
#endif
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <memory>
// Smart pointers for ggml types
// ggml
struct ggml_context_deleter { void operator()(ggml_context * ctx) { ggml_free(ctx); } };
struct gguf_context_deleter { void operator()(gguf_context * ctx) { gguf_free(ctx); } };
typedef std::unique_ptr<ggml_context, ggml_context_deleter> ggml_context_ptr;
typedef std::unique_ptr<gguf_context, gguf_context_deleter> gguf_context_ptr;
// ggml-alloc
struct ggml_gallocr_deleter { void operator()(ggml_gallocr_t galloc) { ggml_gallocr_free(galloc); } };
typedef std::unique_ptr<ggml_gallocr_t, ggml_gallocr_deleter> ggml_gallocr_ptr;
// ggml-backend
struct ggml_backend_deleter { void operator()(ggml_backend_t backend) { ggml_backend_free(backend); } };
struct ggml_backend_buffer_deleter { void operator()(ggml_backend_buffer_t buffer) { ggml_backend_buffer_free(buffer); } };
struct ggml_backend_event_deleter { void operator()(ggml_backend_event_t event) { ggml_backend_event_free(event); } };
struct ggml_backend_sched_deleter { void operator()(ggml_backend_sched_t sched) { ggml_backend_sched_free(sched); } };
typedef std::unique_ptr<ggml_backend, ggml_backend_deleter> ggml_backend_ptr;
typedef std::unique_ptr<ggml_backend_buffer, ggml_backend_buffer_deleter> ggml_backend_buffer_ptr;
typedef std::unique_ptr<ggml_backend_event, ggml_backend_event_deleter> ggml_backend_event_ptr;
typedef std::unique_ptr<ggml_backend_sched, ggml_backend_sched_deleter> ggml_backend_sched_ptr;

View File

@ -1,152 +0,0 @@
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#ifdef __cplusplus
extern "C" {
#endif
// the compute plan that needs to be prepared for ggml_graph_compute()
// since https://github.com/ggerganov/ggml/issues/287
struct ggml_cplan {
size_t work_size; // size of work buffer, calculated by `ggml_graph_plan()`
uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`
int n_threads;
struct ggml_threadpool * threadpool;
// abort ggml_graph_compute when true
ggml_abort_callback abort_callback;
void * abort_callback_data;
};
// numa strategies
enum ggml_numa_strategy {
GGML_NUMA_STRATEGY_DISABLED = 0,
GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
GGML_NUMA_STRATEGY_ISOLATE = 2,
GGML_NUMA_STRATEGY_NUMACTL = 3,
GGML_NUMA_STRATEGY_MIRROR = 4,
GGML_NUMA_STRATEGY_COUNT
};
GGML_BACKEND_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_BACKEND_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_BACKEND_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_BACKEND_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_BACKEND_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_BACKEND_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_BACKEND_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_BACKEND_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_BACKEND_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_BACKEND_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_BACKEND_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_BACKEND_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_BACKEND_API int ggml_threadpool_get_n_threads (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_BACKEND_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_BACKEND_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_BACKEND_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_BACKEND_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
//
// system info
//
// x86
GGML_BACKEND_API int ggml_cpu_has_sse3 (void);
GGML_BACKEND_API int ggml_cpu_has_ssse3 (void);
GGML_BACKEND_API int ggml_cpu_has_avx (void);
GGML_BACKEND_API int ggml_cpu_has_avx_vnni (void);
GGML_BACKEND_API int ggml_cpu_has_avx2 (void);
GGML_BACKEND_API int ggml_cpu_has_f16c (void);
GGML_BACKEND_API int ggml_cpu_has_fma (void);
GGML_BACKEND_API int ggml_cpu_has_avx512 (void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vbmi(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_vnni(void);
GGML_BACKEND_API int ggml_cpu_has_avx512_bf16(void);
GGML_BACKEND_API int ggml_cpu_has_amx_int8 (void);
// ARM
GGML_BACKEND_API int ggml_cpu_has_neon (void);
GGML_BACKEND_API int ggml_cpu_has_arm_fma (void);
GGML_BACKEND_API int ggml_cpu_has_fp16_va (void);
GGML_BACKEND_API int ggml_cpu_has_dotprod (void);
GGML_BACKEND_API int ggml_cpu_has_matmul_int8(void);
GGML_BACKEND_API int ggml_cpu_has_sve (void);
GGML_BACKEND_API int ggml_cpu_get_sve_cnt (void); // sve vector length in bytes
// other
GGML_BACKEND_API int ggml_cpu_has_riscv_v (void);
GGML_BACKEND_API int ggml_cpu_has_vsx (void);
GGML_BACKEND_API int ggml_cpu_has_wasm_simd (void);
GGML_BACKEND_API int ggml_cpu_has_llamafile (void);
// Internal types and functions exposed for tests and benchmarks
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits_cpu {
ggml_from_float_t from_float;
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
};
GGML_BACKEND_API const struct ggml_type_traits_cpu * ggml_get_type_traits_cpu(enum ggml_type type);
GGML_BACKEND_API void ggml_cpu_init(void);
//
// CPU backend
//
GGML_BACKEND_API ggml_backend_t ggml_backend_cpu_init(void);
GGML_BACKEND_API bool ggml_backend_is_cpu (ggml_backend_t backend);
GGML_BACKEND_API void ggml_backend_cpu_set_n_threads (ggml_backend_t backend_cpu, int n_threads);
GGML_BACKEND_API void ggml_backend_cpu_set_threadpool (ggml_backend_t backend_cpu, ggml_threadpool_t threadpool);
GGML_BACKEND_API void ggml_backend_cpu_set_abort_callback(ggml_backend_t backend_cpu, ggml_abort_callback abort_callback, void * abort_callback_data);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cpu_reg(void);
#ifdef GGML_USE_CPU_HBM
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
#endif
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cpu_aarch64_buffer_type(void);
GGML_BACKEND_API bool ggml_backend_cpu_buft_is_aarch64(ggml_backend_buffer_type_t buft);
#ifdef __cplusplus
}
#endif

View File

@ -7,7 +7,7 @@
extern "C" {
#endif
#ifdef GGML_USE_HIP
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
#define GGML_CUBLAS_NAME "hipBLAS"
#elif defined(GGML_USE_MUSA)
@ -20,27 +20,27 @@ extern "C" {
#define GGML_CUDA_MAX_DEVICES 16
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_cuda_init(int device);
GGML_API ggml_backend_t ggml_backend_cuda_init(int device);
GGML_BACKEND_API bool ggml_backend_is_cuda(ggml_backend_t backend);
GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
GGML_BACKEND_API int ggml_backend_cuda_get_device_count(void);
GGML_BACKEND_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_BACKEND_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_API int ggml_backend_cuda_get_device_count(void);
GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total);
GGML_BACKEND_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_BACKEND_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_API bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size);
GGML_API void ggml_backend_cuda_unregister_host_buffer(void * buffer);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
GGML_API ggml_backend_reg_t ggml_backend_cuda_reg(void);
#ifdef __cplusplus
}

View File

@ -11,8 +11,6 @@
extern "C" {
#endif
#define GGML_KOMPUTE_MAX_DEVICES 16
struct ggml_vk_device {
int index;
int type; // same as VkPhysicalDeviceType
@ -37,13 +35,11 @@ struct ggml_vk_device ggml_vk_current_device(void);
// forward declaration
typedef struct ggml_backend * ggml_backend_t;
GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_API ggml_backend_t ggml_backend_kompute_init(int device);
GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_API bool ggml_backend_is_kompute(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device);
#ifdef __cplusplus
}

View File

@ -39,27 +39,23 @@ extern "C" {
// user-code should use only these functions
//
GGML_BACKEND_API ggml_backend_t ggml_backend_metal_init(void);
GGML_API ggml_backend_t ggml_backend_metal_init(void);
GGML_BACKEND_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
GGML_DEPRECATED(
GGML_BACKEND_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size),
"obsoleted by the new device interface - https://github.com/ggerganov/llama.cpp/pull/9713");
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
GGML_BACKEND_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_API void ggml_backend_metal_set_abort_callback(ggml_backend_t backend, ggml_abort_callback abort_callback, void * user_data);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
// helper to check if the device supports a specific family
// ideally, the user code should be doing these checks
// ref: https://developer.apple.com/metal/Metal-Feature-Set-Tables.pdf
GGML_BACKEND_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
GGML_API bool ggml_backend_metal_supports_family(ggml_backend_t backend, int family);
// capture all command buffers committed the next time `ggml_backend_graph_compute` is called
GGML_BACKEND_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_metal_reg(void);
GGML_API void ggml_backend_metal_capture_next_compute(ggml_backend_t backend);
#ifdef __cplusplus
}

View File

@ -1,216 +0,0 @@
// This file contains functionality for training models using GGML.
// It is not strictly needed vs. just vanilla GGML but it provides a more high-level interface for common needs such as datasets.
// At the bottom of this file especially there are relatively high-level functions that are suitable for use or adaptation in user code.
//
// Module maintainer: Johannes Gäßler (@JohannesGaessler, johannesg@5d6.de)
#pragma once
#include "ggml.h"
#include "ggml-backend.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
struct ggml_opt_dataset;
struct ggml_opt_context;
struct ggml_opt_result;
typedef struct ggml_opt_dataset * ggml_opt_dataset_t;
typedef struct ggml_opt_context * ggml_opt_context_t;
typedef struct ggml_opt_result * ggml_opt_result_t;
// ====== Loss ======
// built-in loss types, i.e. the built-in quantities minimized by the optimizer
// custom loss types can be defined via mean or sum which simply reduce the outputs for all datapoints to a single value
enum ggml_opt_loss_type {
GGML_OPT_LOSS_TYPE_MEAN,
GGML_OPT_LOSS_TYPE_SUM,
GGML_OPT_LOSS_TYPE_CROSS_ENTROPY,
GGML_OPT_LOSS_TYPE_MEAN_SQUARED_ERROR,
};
// ====== Dataset ======
// create a dataset handle for ndata datapoints (and labels) that are organized in shards of ndata_shard entries
// NOTE(review): presumably the handle has to be released with ggml_opt_dataset_free - confirm
GGML_API ggml_opt_dataset_t ggml_opt_dataset_init(
int64_t ne_datapoint, // number of elements per datapoint
int64_t ne_label, // number of elements per label
int64_t ndata, // total number of datapoints/labels
int64_t ndata_shard); // number of datapoints/labels per shard (unit at which the dataset is shuffled/copied)
GGML_API void ggml_opt_dataset_free(ggml_opt_dataset_t dataset);
// get underlying tensors that store the data
GGML_API struct ggml_tensor * ggml_opt_dataset_data (ggml_opt_dataset_t dataset); // shape = [ne_datapoint, ndata]
GGML_API struct ggml_tensor * ggml_opt_dataset_labels(ggml_opt_dataset_t dataset); // shape = [ne_label, ndata]
// shuffle the first idata datapoints from dataset with RNG from opt_ctx, shuffle all datapoints if idata is negative
GGML_API void ggml_opt_dataset_shuffle(ggml_opt_context_t opt_ctx, ggml_opt_dataset_t dataset, int64_t idata);
// get batch at position ibatch from dataset and copy the data to data_batch and labels_batch
// both destination tensors are supplied by the caller
GGML_API void ggml_opt_dataset_get_batch(
ggml_opt_dataset_t dataset,
struct ggml_tensor * data_batch, // shape = [ne_datapoint, ndata_batch]
struct ggml_tensor * labels_batch, // shape = [ne_label, ndata_batch]
int64_t ibatch);
// ====== Model / Context ======
enum ggml_opt_build_type {
    GGML_OPT_BUILD_TYPE_FORWARD = 0, // NOTE(review): presumably forward pass only - confirm
    GGML_OPT_BUILD_TYPE_GRAD    = 1, // NOTE(review): presumably forward pass + gradients - confirm
    GGML_OPT_BUILD_TYPE_OPT     = 2, // NOTE(review): presumably forward pass + gradients + optimizer step - confirm
};
// Settings that select the optimizer and steer how it searches for the minimal loss.
struct ggml_opt_optimizer_params {
    // parameters of the AdamW optimizer
    struct {
        float alpha; // learning rate
        float beta1;
        float beta2;
        float eps;   // epsilon for numerical stability
        float wd;    // weight decay for AdamW, use 0.0f to disable
    } adamw;
};
// callback to calculate optimizer parameters prior to a backward pass
// userdata can be used to pass arbitrary data
// the optimizer parameters are returned by value
typedef struct ggml_opt_optimizer_params (*ggml_opt_get_optimizer_params)(void * userdata);
// returns the default optimizer params (constant)
// userdata is not used
// the signature matches ggml_opt_get_optimizer_params, so this function can be passed wherever that callback type is expected
GGML_API struct ggml_opt_optimizer_params ggml_opt_get_default_optimizer_params(void * userdata);
// Everything that is needed to set up a new optimization context.
struct ggml_opt_params {
    ggml_backend_sched_t  backend_sched; // defines which backends are used to construct the compute graphs
    struct ggml_context * ctx_compute;   // created in user code, holds non-static tensors

    // the forward graph is defined by inputs and outputs
    // those tensors and all tensors in between are not intended to be reusable between multiple optimization contexts
    struct ggml_tensor * inputs;
    struct ggml_tensor * outputs;

    enum ggml_opt_loss_type  loss_type;
    enum ggml_opt_build_type build_type;

    int32_t opt_period; // after how many gradient accumulation steps an optimizer step should be done

    ggml_opt_get_optimizer_params get_opt_pars;    // callback for calculating optimizer parameters
    void *                        get_opt_pars_ud; // userdata for calculating optimizer parameters
};
// get parameters for an optimization context with defaults set where possible
// parameters for which no sensible defaults exist are supplied as arguments to this function
GGML_API ggml_opt_params ggml_opt_default_params(
ggml_backend_sched_t backend_sched,
struct ggml_context * ctx_compute,
struct ggml_tensor * inputs,
struct ggml_tensor * outputs,
enum ggml_opt_loss_type loss_type);
// create a new optimization context from params
// NOTE(review): presumably the returned context has to be released with ggml_opt_free - confirm
GGML_API ggml_opt_context_t ggml_opt_init(struct ggml_opt_params params);
GGML_API void ggml_opt_free(ggml_opt_context_t opt_ctx);
// set gradients to zero, initialize loss, and optionally reset the optimizer
GGML_API void ggml_opt_reset(ggml_opt_context_t opt_ctx, bool optimizer);
// get underlying tensors that store data
GGML_API struct ggml_tensor * ggml_opt_inputs( ggml_opt_context_t opt_ctx); // forward graph input tensor
GGML_API struct ggml_tensor * ggml_opt_outputs( ggml_opt_context_t opt_ctx); // forward graph output tensor
GGML_API struct ggml_tensor * ggml_opt_labels( ggml_opt_context_t opt_ctx); // labels to compare outputs against
GGML_API struct ggml_tensor * ggml_opt_loss( ggml_opt_context_t opt_ctx); // scalar tensor that contains the loss
GGML_API struct ggml_tensor * ggml_opt_pred( ggml_opt_context_t opt_ctx); // predictions made by outputs
GGML_API struct ggml_tensor * ggml_opt_ncorrect(ggml_opt_context_t opt_ctx); // number of matching predictions between outputs and labels
// NOTE(review): presumably returns the tensor in which the gradients of node are accumulated - confirm
GGML_API struct ggml_tensor * ggml_opt_grad_acc(ggml_opt_context_t opt_ctx, struct ggml_tensor * node);
// ====== Optimization Result ======
// lifecycle of a result object: init returns a new handle, free takes a handle, reset takes a handle
// NOTE(review): presumably reset returns the result to its freshly initialized state - confirm
// note: the parameter list of init is spelled (void) because in C (before C23) an empty list ()
//       declares a function with unspecified parameters instead of a prototype without parameters
GGML_API ggml_opt_result_t ggml_opt_result_init(void);
GGML_API void ggml_opt_result_free(ggml_opt_result_t result);
GGML_API void ggml_opt_result_reset(ggml_opt_result_t result);
// get data from result, uncertainties are optional and can be ignored by passing NULL
// all values are written through the output pointers, the functions themselves return nothing
// only the loss and accuracy getters take an uncertainty pointer (unc)
GGML_API void ggml_opt_result_ndata( ggml_opt_result_t result, int64_t * ndata); // writes 1 value, number of datapoints
GGML_API void ggml_opt_result_loss( ggml_opt_result_t result, double * loss, double * unc); // writes 1 value
GGML_API void ggml_opt_result_pred( ggml_opt_result_t result, int32_t * pred); // writes ndata values
GGML_API void ggml_opt_result_accuracy(ggml_opt_result_t result, double * accuracy, double * unc); // writes 1 value
// ====== Computation ======
// do forward pass, increment result if not NULL
// result may be NULL if no statistics are needed
GGML_API void ggml_opt_forward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// do forward pass, increment result if not NULL, do backward pass
// result may be NULL if no statistics are needed
GGML_API void ggml_opt_forward_backward(ggml_opt_context_t opt_ctx, ggml_opt_result_t result);
// ############################################################################
// ## The high-level functions start here. They do not depend on any private ##
// ## functions or structs and can be copied to and adapted for user code. ##
// ############################################################################
// ====== Intended Usage ======
//
// 1. Select the appropriate loss for your problem.
// 2. Create a dataset and set the data for the "data" tensor. Also set the "labels" tensor if your loss needs them.
// Setting the shard size to 1 will be fine, it's the granularity with which data is shuffled/loaded (bigger values are faster).
// 3. Create a GGML graph for your model with no_alloc == true. Use two separate contexts for the tensors.
// The first context should contain the model parameters and inputs and be allocated statically in user code.
// The second context should contain all other tensors and will be (re)allocated automatically.
// Due to this automated allocation the data of the second context is not defined when accessed in user code.
// Note that the second dimension of the inputs/outputs are interpreted as the number of datapoints in those tensors.
// 4. Call ggml_opt_fit. If you need more control you can use ggml_opt_epoch instead.
// signature for a callback while evaluating opt_ctx on dataset, called after an evaluation
// the same signature is used for the training and the validation callback, the two cases are told apart via train
typedef void (*ggml_opt_epoch_callback)(
bool train, // true after training evaluation, false after validation evaluation
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result, // result associated with the dataset subsection
int64_t ibatch, // number of batches that have been evaluated so far
int64_t ibatch_max, // total number of batches in this dataset subsection
int64_t t_start_us); // time at which the evaluation on the dataset subsection was started
// do training on front of dataset, do evaluation only on back of dataset
// the boundary between front and back is given by idata_split
GGML_API void ggml_opt_epoch(
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result_train, // result to increment during training, ignored if NULL
ggml_opt_result_t result_eval, // result to increment during evaluation, ignored if NULL
int64_t idata_split, // data index at which to split training and evaluation
ggml_opt_epoch_callback callback_train, // NOTE(review): presumably called during the training part - confirm whether NULL is allowed
ggml_opt_epoch_callback callback_eval); // NOTE(review): presumably called during the evaluation part - confirm whether NULL is allowed
// callback that prints a progress bar on stderr
// the parameter list matches ggml_opt_epoch_callback, so this function can be passed as callback_train/callback_eval to ggml_opt_epoch
GGML_API void ggml_opt_epoch_callback_progress_bar(
bool train,
ggml_opt_context_t opt_ctx,
ggml_opt_dataset_t dataset,
ggml_opt_result_t result,
int64_t ibatch,
int64_t ibatch_max,
int64_t t_start_us);
// fit model defined by inputs and outputs to dataset
// note: ggml_context and ggml_tensor are spelled with the `struct` keyword because no typedefs with those names exist;
//       without the keyword the declaration is valid C++ but does not compile as C
GGML_API void ggml_opt_fit(
    ggml_backend_sched_t          backend_sched,  // backend scheduler for constructing the compute graphs
    struct ggml_context         * ctx_compute,    // context with temporarily allocated tensors to calculate the outputs
    struct ggml_tensor          * inputs,         // input tensor with shape [ne_datapoint, ndata_batch]
    struct ggml_tensor          * outputs,        // output tensor, must have shape [ne_label, ndata_batch] if labels are used
    ggml_opt_dataset_t            dataset,        // dataset with data and optionally also labels
    enum ggml_opt_loss_type       loss_type,      // loss to minimize
    ggml_opt_get_optimizer_params get_opt_pars,   // callback to get optimizer params, userdata is pointer to epoch (of type int64_t)
    int64_t                       nepoch,         // how many times the dataset should be iterated over
    int64_t                       nbatch_logical, // datapoints per optimizer step, must be a multiple of ndata_batch in inputs/outputs
    float                         val_split,      // fraction of the dataset to use for validation, must be in [0.0f, 1.0f)
    bool                          silent);        // whether or not info prints to stderr should be suppressed
#ifdef __cplusplus
}
#endif

View File

@ -10,18 +10,14 @@ extern "C" {
#define GGML_RPC_MAX_SERVERS 16
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_BACKEND_API bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_rpc_init(const char * endpoint);
GGML_API bool ggml_backend_is_rpc(ggml_backend_t backend);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const char * endpoint);
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
GGML_BACKEND_API ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
GGML_API void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
#ifdef __cplusplus
}

View File

@ -17,33 +17,26 @@ extern "C" {
#endif
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_sycl_init(int device);
GGML_BACKEND_API bool ggml_backend_is_sycl(ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sycl_init(int device);
// device buffer
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void);
GGML_BACKEND_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_BACKEND_API void ggml_backend_sycl_get_gpu_list(int *id_list, int max_len);
GGML_BACKEND_API void ggml_backend_sycl_get_device_description(int device,
char *description,
size_t description_size);
GGML_BACKEND_API int ggml_backend_sycl_get_device_count();
GGML_BACKEND_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
GGML_API void ggml_backend_sycl_print_sycl_devices(void);
GGML_API void ggml_sycl_get_gpu_list(int *id_list, int max_len);
GGML_API void ggml_sycl_get_device_description(int device, char *description, size_t description_size);
GGML_API int ggml_backend_sycl_get_device_count();
GGML_API void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total);
// SYCL doesn't support registering host memory, keep here for reference
// GGML_BACKEND_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_BACKEND_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_sycl_reg(void);
// GGML_API bool ggml_backend_sycl_register_host_buffer(void * buffer, size_t size);
// GGML_API void ggml_backend_sycl_unregister_host_buffer(void * buffer);
#ifdef __cplusplus
}
#endif

View File

@ -10,21 +10,19 @@ extern "C" {
#define GGML_VK_NAME "Vulkan"
#define GGML_VK_MAX_DEVICES 16
GGML_BACKEND_API void ggml_vk_instance_init(void);
GGML_API void ggml_vk_instance_init(void);
// backend API
GGML_BACKEND_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_API ggml_backend_t ggml_backend_vk_init(size_t dev_num);
GGML_BACKEND_API bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_BACKEND_API int ggml_backend_vk_get_device_count(void);
GGML_BACKEND_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_BACKEND_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_API bool ggml_backend_is_vk(ggml_backend_t backend);
GGML_API int ggml_backend_vk_get_device_count(void);
GGML_API void ggml_backend_vk_get_device_description(int device, char * description, size_t description_size);
GGML_API void ggml_backend_vk_get_device_memory(int device, size_t * free, size_t * total);
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_buffer_type(size_t dev_num);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_vk_reg(void);
GGML_API ggml_backend_buffer_type_t ggml_backend_vk_host_buffer_type(void);
#ifdef __cplusplus
}

View File

@ -176,15 +176,15 @@
#ifdef GGML_SHARED
# if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BUILD
# define GGML_API __declspec(dllexport) extern
# define GGML_API __declspec(dllexport)
# else
# define GGML_API __declspec(dllimport) extern
# define GGML_API __declspec(dllimport)
# endif
# else
# define GGML_API __attribute__ ((visibility ("default"))) extern
# define GGML_API __attribute__ ((visibility ("default")))
# endif
#else
# define GGML_API extern
# define GGML_API
#endif
// TODO: support for clang
@ -389,9 +389,6 @@ extern "C" {
GGML_TYPE_Q4_0_8_8 = 33,
GGML_TYPE_TQ1_0 = 34,
GGML_TYPE_TQ2_0 = 35,
GGML_TYPE_IQ4_NL_4_4 = 36,
// GGML_TYPE_IQ4_NL_4_8 = 37,
// GGML_TYPE_IQ4_NL_8_8 = 38,
GGML_TYPE_COUNT,
};
@ -499,7 +496,6 @@ extern "C" {
GGML_OP_POOL_2D_BACK,
GGML_OP_UPSCALE, // nearest interpolate
GGML_OP_PAD,
GGML_OP_PAD_REFLECT_1D,
GGML_OP_ARANGE,
GGML_OP_TIMESTEP_EMBEDDING,
GGML_OP_ARGSORT,
@ -513,7 +509,7 @@ extern "C" {
GGML_OP_WIN_UNPART,
GGML_OP_GET_REL_POS,
GGML_OP_ADD_REL_POS,
GGML_OP_RWKV_WKV6,
GGML_OP_RWKV_WKV,
GGML_OP_UNARY,
@ -562,10 +558,10 @@ extern "C" {
enum ggml_log_level {
GGML_LOG_LEVEL_NONE = 0,
GGML_LOG_LEVEL_DEBUG = 1,
GGML_LOG_LEVEL_INFO = 2,
GGML_LOG_LEVEL_WARN = 3,
GGML_LOG_LEVEL_ERROR = 4,
GGML_LOG_LEVEL_INFO = 1,
GGML_LOG_LEVEL_WARN = 2,
GGML_LOG_LEVEL_ERROR = 3,
GGML_LOG_LEVEL_DEBUG = 4,
GGML_LOG_LEVEL_CONT = 5, // continue previous log
};
@ -577,13 +573,6 @@ extern "C" {
GGML_TENSOR_FLAG_LOSS = 8, // ...defines loss for numerical optimization (multiple loss tensors add up)
};
struct ggml_init_params {
// memory pool
size_t mem_size; // bytes
void * mem_buffer; // if NULL, memory will be allocated internally
bool no_alloc; // don't allocate memory for the tensor data
};
// n-dimensional tensor
struct ggml_tensor {
enum ggml_type type;
@ -606,6 +595,7 @@ extern "C" {
int32_t flags;
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
// source tensor and offset for views
@ -618,7 +608,7 @@ extern "C" {
void * extra; // extra things e.g. for ggml-cuda.cu
char padding[8];
// char padding[4];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@ -628,6 +618,67 @@ extern "C" {
// If it returns true, the computation is aborted
typedef bool (*ggml_abort_callback)(void * data);
// Scheduling priorities, listed in ascending numeric order
enum ggml_sched_priority {
    GGML_SCHED_PRIO_NORMAL   = 0,
    GGML_SCHED_PRIO_MEDIUM   = 1,
    GGML_SCHED_PRIO_HIGH     = 2,
    GGML_SCHED_PRIO_REALTIME = 3
};
// Parameters of a threadpool
// Populate the defaults via ggml_threadpool_params_default() or ggml_threadpool_params_init()
struct ggml_threadpool_params {
    bool                     cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
    int                      n_threads;                   // number of threads
    enum ggml_sched_priority prio;                        // thread priority
    uint32_t                 poll;                        // polling level (0 - no polling, 100 - aggressive polling)
    bool                     strict_cpu;                  // strict cpu placement
    bool                     paused;                      // start in paused state
};
// opaque in this header: only pointers to the struct can be used here
struct ggml_threadpool; // forward declaration, see ggml.c
// handle type for a threadpool
typedef struct ggml_threadpool * ggml_threadpool_t;
// Compute plan, has to be prepared before calling ggml_graph_compute()
// (exists since https://github.com/ggerganov/ggml/issues/287)
struct ggml_cplan {
    size_t    work_size; // size of work buffer, calculated by `ggml_graph_plan()`
    uint8_t * work_data; // work buffer, to be allocated by caller before calling to `ggml_graph_compute()`

    int                      n_threads;
    struct ggml_threadpool * threadpool;

    // abort ggml_graph_compute when true
    ggml_abort_callback abort_callback;
    void *              abort_callback_data;
};
// Scratch buffer descriptor
// TODO: deprecate and remove
struct ggml_scratch {
    size_t offs; // NOTE(review): presumably the current offset into data - confirm
    size_t size; // NOTE(review): presumably the size of data in bytes - confirm
    void * data;
};
struct ggml_init_params {
    // description of the memory pool
    size_t mem_size;   // bytes
    void * mem_buffer; // if NULL, memory will be allocated internally
    bool   no_alloc;   // don't allocate memory for the tensor data
};
// NUMA strategies
enum ggml_numa_strategy {
    GGML_NUMA_STRATEGY_DISABLED   = 0,
    GGML_NUMA_STRATEGY_DISTRIBUTE = 1,
    GGML_NUMA_STRATEGY_ISOLATE    = 2,
    GGML_NUMA_STRATEGY_NUMACTL    = 3,
    GGML_NUMA_STRATEGY_MIRROR     = 4,
    GGML_NUMA_STRATEGY_COUNT // has no explicit value so that it follows the last strategy; keep it as the final entry
};
//
// GUID
@ -650,6 +701,9 @@ extern "C" {
// accepts a UTF-8 path, even on Windows
GGML_API FILE * ggml_fopen(const char * fname, const char * mode);
GGML_API void ggml_numa_init(enum ggml_numa_strategy numa); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
GGML_API void ggml_print_object (const struct ggml_object * obj);
GGML_API void ggml_print_objects(const struct ggml_context * ctx);
@ -712,6 +766,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API bool ggml_get_no_alloc(struct ggml_context * ctx);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
@ -751,7 +806,8 @@ extern "C" {
int64_t ne2,
int64_t ne3);
GGML_API void * ggml_new_buffer(struct ggml_context * ctx, size_t nbytes);
GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
@ -761,25 +817,35 @@ extern "C" {
GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
// Converts a flat index into coordinates
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
// Converts a flat index into coordinates
GGML_API void ggml_unravel_index(const struct ggml_tensor * tensor, int64_t i, int64_t * i0, int64_t * i1, int64_t * i2, int64_t * i3);
GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
GGML_API int32_t ggml_get_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_API void ggml_set_i32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, int32_t value);
GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
GGML_API float ggml_get_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3);
GGML_API void ggml_set_f32_nd(const struct ggml_tensor * tensor, int i0, int i1, int i2, int i3, float value);
GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
GGML_API enum ggml_unary_op ggml_get_unary_op(const struct ggml_tensor * tensor);
GGML_API const char * ggml_get_name (const struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_name ( struct ggml_tensor * tensor, const char * name);
GGML_ATTRIBUTE_FORMAT(2, 3)
GGML_API struct ggml_tensor * ggml_format_name( struct ggml_tensor * tensor, const char * fmt, ...);
// Tensor flags
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
//
// operations on tensors with backpropagation
//
@ -1493,7 +1559,7 @@ extern "C" {
"use ggml_rope_ext_inplace instead");
// compute correction dims for YaRN RoPE scaling
GGML_API void ggml_rope_yarn_corr_dims(
void ggml_rope_yarn_corr_dims(
int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, float dims[2]);
// rotary position embedding backward, i.e compute dx from dy
@ -1696,13 +1762,6 @@ extern "C" {
int p2,
int p3);
// pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
GGML_API struct ggml_tensor * ggml_pad_reflect_1d(
struct ggml_context * ctx,
struct ggml_tensor * a,
int p0,
int p1);
// Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
// timesteps: [N,]
// return: [N, dim]
@ -1756,9 +1815,6 @@ extern "C" {
struct ggml_tensor * a,
enum ggml_prec prec);
GGML_API enum ggml_prec ggml_flash_attn_ext_get_prec(
const struct ggml_tensor * a);
// TODO: needs to be adapted to ggml_flash_attn_ext
GGML_API struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx,
@ -1832,7 +1888,7 @@ extern "C" {
struct ggml_tensor * pw,
struct ggml_tensor * ph);
GGML_API struct ggml_tensor * ggml_rwkv_wkv6(
GGML_API struct ggml_tensor * ggml_rwkv_wkv(
struct ggml_context * ctx,
struct ggml_tensor * k,
struct ggml_tensor * v,
@ -1995,20 +2051,31 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * grad,
struct ggml_tensor * m,
struct ggml_tensor * v,
struct ggml_tensor * adamw_params); // parameters such a the learning rate
float alpha,
float beta1,
float beta2,
float eps,
float wd); // weight decay
//
// automatic differentiation
//
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(
struct ggml_context * ctx_static, // context for static gradients (loss + gradient accumulation)
struct ggml_context * ctx_compute, // context for gradient computation
struct ggml_cgraph * cgraph,
bool accumulate); // whether or not gradients should be accumulated, requires static allocation of tensors in ctx_static
GGML_API void ggml_set_param(struct ggml_context * ctx, struct ggml_tensor * tensor);
GGML_API void ggml_set_loss(struct ggml_tensor * tensor);
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool accumulate);
GGML_API void ggml_build_opt_adamw(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
float alpha,
float beta1,
float beta2,
float eps,
float wd); // weight decay
// graph allocation in a context
GGML_API struct ggml_cgraph * ggml_new_graph (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
@ -2028,9 +2095,28 @@ extern "C" {
GGML_API size_t ggml_graph_overhead(void);
GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
GGML_API struct ggml_tensor * ggml_graph_get_tensor (const struct ggml_cgraph * cgraph, const char * name);
GGML_API struct ggml_tensor * ggml_graph_get_grad (const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
GGML_API struct ggml_tensor * ggml_graph_get_grad_acc(const struct ggml_cgraph * cgraph, const struct ggml_tensor * node);
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API struct ggml_threadpool * ggml_threadpool_new (struct ggml_threadpool_params * params);
GGML_API void ggml_threadpool_free (struct ggml_threadpool * threadpool);
GGML_API int ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_pause (struct ggml_threadpool * threadpool);
GGML_API void ggml_threadpool_resume (struct ggml_threadpool * threadpool);
// ggml_graph_plan() has to be called before ggml_graph_compute()
// when plan.work_size > 0, caller must allocate memory for plan.work_data
GGML_API struct ggml_cplan ggml_graph_plan(
const struct ggml_cgraph * cgraph,
int n_threads, /* = GGML_DEFAULT_N_THREADS */
struct ggml_threadpool * threadpool /* = NULL */ );
GGML_API enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan);
// same as ggml_graph_compute() but the work data is allocated as a part of the context
// note: the drawback of this API is that you must have ensured that the context has enough memory for the work data
GGML_API enum ggml_status ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads);
GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
GGML_API struct ggml_cgraph * ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
@ -2041,14 +2127,201 @@ extern "C" {
// dump the graph into a file using the dot format
GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
// TODO these functions were sandwiched in the old optimization interface, is there a better place for them?
// build gradient checkpointing backward graph gb for gf using provided checkpoints
// gb_tmp will contain original backward graph with rewritten backward process nodes,
// but without the second forward pass nodes.
GGML_API void ggml_build_backward_gradient_checkpointing(
struct ggml_context * ctx,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
struct ggml_cgraph * gb_tmp,
struct ggml_tensor * * checkpoints,
int n_checkpoints);
//
// optimization
//
// available optimization methods
enum ggml_opt_type {
    GGML_OPT_TYPE_ADAM  = 0,
    GGML_OPT_TYPE_LBFGS = 1,
};
// available linesearch methods
enum ggml_linesearch {
    GGML_LINESEARCH_DEFAULT                   = 1, // same value as GGML_LINESEARCH_BACKTRACKING_WOLFE

    GGML_LINESEARCH_BACKTRACKING_ARMIJO       = 0,
    GGML_LINESEARCH_BACKTRACKING_WOLFE        = 1,
    GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE = 2,
};
// return values of the optimization functions
enum ggml_opt_result {
    // optimizer results, counted upwards from 0
    GGML_OPT_RESULT_OK               = 0,
    GGML_OPT_RESULT_DID_NOT_CONVERGE = 1,
    GGML_OPT_RESULT_NO_CONTEXT       = 2,
    GGML_OPT_RESULT_INVALID_WOLFE    = 3,
    GGML_OPT_RESULT_FAIL             = 4,
    GGML_OPT_RESULT_CANCEL           = 5,

    // linesearch results, counted upwards from -128 so that all of them are negative
    GGML_LINESEARCH_FAIL               = -128,
    GGML_LINESEARCH_MINIMUM_STEP       = -127,
    GGML_LINESEARCH_MAXIMUM_STEP       = -126,
    GGML_LINESEARCH_MAXIMUM_ITERATIONS = -125,
    GGML_LINESEARCH_INVALID_PARAMETERS = -124,
};
typedef void (*ggml_opt_callback)(void * data, int accum_step, float * sched, bool * cancel);
typedef void (*ggml_log_callback)(enum ggml_log_level level, const char * text, void * user_data);
// Set callback for all future logging events.
// If this is not called, or NULL is supplied, everything is output on stderr.
GGML_API void ggml_log_set(ggml_log_callback log_callback, void * user_data);
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
// Optimization parameters
//
//   the default values are set in ggml.c (ggml_opt_default_params)
//
struct ggml_opt_params {
    enum ggml_opt_type type;

    size_t graph_size;

    int n_threads;

    // delta-based convergence test
    //
    //   past == 0: disabled
    //   past >  0: stop if |f(x) - f(x_past)| < delta * max(1, |f(x)|)
    //
    int   past;
    float delta;

    // maximum number of iterations without improvement
    //
    //   == 0: disabled
    //   >  0: assume convergence if no cost improvement in this number of iterations
    //
    int max_no_improvement;

    bool print_forward_graph;
    bool print_backward_graph;

    int n_gradient_accumulation;

    // parameters of the ADAM optimizer
    struct {
        int n_iter;

        float sched;          // schedule multiplier (fixed, decay or warmup)
        float decay;          // weight decay for AdamW, use 0.0f to disable
        int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
        float alpha;          // learning rate
        float beta1;
        float beta2;
        float eps;            // epsilon for numerical stability
        float eps_f;          // epsilon for convergence test
        float eps_g;          // epsilon for convergence test
        float gclip;          // gradient clipping
    } adam;

    // parameters of the LBFGS optimizer
    struct {
        int m; // number of corrections to approximate the inv. Hessian
        int n_iter;
        int max_linesearch;

        float eps;  // convergence tolerance
        float ftol; // line search tolerance
        float wolfe;
        float min_step;
        float max_step;

        enum ggml_linesearch linesearch;
    } lbfgs;
};
// State of an optimization run: shared bookkeeping plus one sub-struct per optimizer (adam, lbfgs)
struct ggml_opt_context {
    struct ggml_context *  ctx;
    struct ggml_opt_params params;

    int     iter;
    int64_t nx; // number of parameter elements

    bool just_initialized;

    float loss_before;
    float loss_after;

    // state of the ADAM optimizer
    struct {
        struct ggml_tensor * g;  // current gradient
        struct ggml_tensor * m;  // first moment
        struct ggml_tensor * v;  // second moment
        struct ggml_tensor * pf; // past function values
        float fx_best;
        float fx_prev;
        int   n_no_improvement;
    } adam;

    // state of the LBFGS optimizer
    struct {
        struct ggml_tensor * x;    // current parameters
        struct ggml_tensor * xp;   // previous parameters
        struct ggml_tensor * g;    // current gradient
        struct ggml_tensor * gp;   // previous gradient
        struct ggml_tensor * d;    // search direction
        struct ggml_tensor * pf;   // past function values
        struct ggml_tensor * lmal; // the L-BFGS memory alpha
        struct ggml_tensor * lmys; // the L-BFGS memory ys
        struct ggml_tensor * lms;  // the L-BFGS memory s
        struct ggml_tensor * lmy;  // the L-BFGS memory y
        float fx_best;
        float step;
        int   j;
        int   k;
        int   end;
        int   n_no_improvement;
    } lbfgs;
};
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
// optimize the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt(
struct ggml_context * ctx,
struct ggml_opt_params params,
struct ggml_tensor * f);
// initialize optimizer context
GGML_API void ggml_opt_init(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
int64_t nx);
// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f);
// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume_g(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb,
ggml_opt_callback callback,
void * callback_data);
//
// tensor flags
//
GGML_API void ggml_set_input(struct ggml_tensor * tensor);
GGML_API void ggml_set_output(struct ggml_tensor * tensor);
//
// quantization
@ -2205,6 +2478,47 @@ extern "C" {
GGML_API size_t gguf_get_meta_size(const struct gguf_context * ctx);
GGML_API void gguf_get_meta_data(const struct gguf_context * ctx, void * data);
//
// system info
//
GGML_API int ggml_cpu_has_avx (void);
GGML_API int ggml_cpu_has_avx_vnni (void);
GGML_API int ggml_cpu_has_avx2 (void);
GGML_API int ggml_cpu_has_avx512 (void);
GGML_API int ggml_cpu_has_avx512_vbmi(void);
GGML_API int ggml_cpu_has_avx512_vnni(void);
GGML_API int ggml_cpu_has_avx512_bf16(void);
GGML_API int ggml_cpu_has_fma (void);
GGML_API int ggml_cpu_has_neon (void);
GGML_API int ggml_cpu_has_sve (void);
GGML_API int ggml_cpu_has_arm_fma (void);
GGML_API int ggml_cpu_has_metal (void);
GGML_API int ggml_cpu_has_f16c (void);
GGML_API int ggml_cpu_has_fp16_va (void);
GGML_API int ggml_cpu_has_wasm_simd (void);
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cuda (void);
GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_kompute (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_riscv_v (void);
GGML_API int ggml_cpu_has_sycl (void);
GGML_API int ggml_cpu_has_rpc (void);
GGML_API int ggml_cpu_has_vsx (void);
GGML_API int ggml_cpu_has_matmul_int8(void);
GGML_API int ggml_cpu_has_cann (void);
GGML_API int ggml_cpu_has_llamafile (void);
// get the sve vector length in bytes
GGML_API int ggml_cpu_get_sve_cnt(void);
//
// Internal types and functions exposed for tests and benchmarks
//
#ifdef __cplusplus
// restrict not standard in C++
#define GGML_RESTRICT
@ -2213,49 +2527,34 @@ extern "C" {
#endif
typedef void (*ggml_to_float_t) (const void * GGML_RESTRICT x, float * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_t)(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
typedef void (*ggml_from_float_to_mat_t)
(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nr, int64_t k, int64_t bs);
typedef void (*ggml_vec_dot_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x, size_t bx,
const void * GGML_RESTRICT y, size_t by, int nrc);
typedef void (*ggml_gemv_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
typedef void (*ggml_gemm_t) (int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT x,
const void * GGML_RESTRICT y, int nr, int nc);
struct ggml_type_traits {
typedef struct {
const char * type_name;
int64_t blck_size;
int64_t blck_size_interleave; // interleave elements in blocks
size_t type_size;
bool is_quantized;
ggml_to_float_t to_float;
ggml_from_float_t from_float;
ggml_from_float_t from_float_ref;
};
ggml_from_float_to_mat_t from_float_to_mat;
ggml_vec_dot_t vec_dot;
enum ggml_type vec_dot_type;
int64_t nrows; // number of rows to process simultaneously
int64_t ncols; // number of columns to process simultaneously
ggml_gemv_t gemv;
ggml_gemm_t gemm;
} ggml_type_traits_t;
GGML_API const struct ggml_type_traits * ggml_get_type_traits(enum ggml_type type);
// ggml threadpool
// TODO: currently, only a few functions are in the base ggml API, while the rest are in the CPU backend
// the goal should be to create an API that other backends can use, and to move everything to the ggml base
// scheduling priorities
enum ggml_sched_priority {
GGML_SCHED_PRIO_NORMAL,
GGML_SCHED_PRIO_MEDIUM,
GGML_SCHED_PRIO_HIGH,
GGML_SCHED_PRIO_REALTIME
};
// threadpool params
// Use ggml_threadpool_params_default() or ggml_threadpool_params_init() to populate the defaults
struct ggml_threadpool_params {
bool cpumask[GGML_MAX_N_THREADS]; // mask of cpu cores (all-zeros means use default affinity settings)
int n_threads; // number of threads
enum ggml_sched_priority prio; // thread priority
uint32_t poll; // polling level (0 - no polling, 100 - aggressive polling)
bool strict_cpu; // strict cpu placement
bool paused; // start in paused state
};
struct ggml_threadpool; // forward declaration, see ggml.c
typedef struct ggml_threadpool * ggml_threadpool_t;
GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
GGML_API void ggml_threadpool_params_init (struct ggml_threadpool_params * p, int n_threads);
GGML_API bool ggml_threadpool_params_match (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
GGML_API ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type);
#ifdef __cplusplus
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,9 @@
// SPDX-FileCopyrightText: Copyright 2024 Arm Ltd.
#pragma once
#define GGML_COMMON_DECL_C
#include "ggml-common.h"
#include "ggml.h"
// GGML internal header
@ -8,11 +12,27 @@
extern "C" {
#endif
// Quantization
void quantize_q8_0_4x4(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_q8_0_4x8(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t k);
void quantize_mat_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int64_t nrows, int64_t n_per_row, int64_t blck_size_interleave);
// Quantization utilizing an importance matrix (a.k.a. "Activation aWare Quantization")
size_t quantize_q4_0_4x4(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_4x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
size_t quantize_q4_0_8x8(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int64_t nrows, int64_t n_per_row, const float * imatrix);
// GEMV
void ggml_gemv_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemv_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
// GEMM
void ggml_gemm_q4_0_4x4_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_4x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
void ggml_gemm_q4_0_8x8_q8_0(int n, float * GGML_RESTRICT s, size_t bs, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy, int nr, int nc);
#ifdef __cplusplus
}
#endif

View File

@ -14,7 +14,7 @@
//#define GGML_ALLOCATOR_DEBUG
//#define AT_PRINTF(...) GGML_LOG_DEBUG(__VA_ARGS__)
//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
#define AT_PRINTF(...)
@ -89,7 +89,7 @@ void ggml_tallocr_alloc(struct ggml_tallocr * talloc, struct ggml_tensor * tenso
size = GGML_PAD(size, talloc->alignment);
if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
__func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
GGML_ABORT("not enough space in the buffer");
}
@ -172,7 +172,7 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
best_fit_block = alloc->n_free_blocks - 1;
} else {
// this should never happen
GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
__func__, size, max_avail);
GGML_ABORT("not enough space in the buffer");
}
@ -209,16 +209,16 @@ static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t siz
}
}
}
GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
if (alloc->allocated_tensors[i].tensor) {
GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
alloc->allocated_tensors[i].offset,
alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
GGML_LOG_DEBUG("\n");
fprintf(stderr, "\n");
}
#endif
@ -348,6 +348,7 @@ struct tensor_alloc {
};
struct leaf_alloc {
int buffer_id;
struct tensor_alloc leaf;
};
@ -466,12 +467,18 @@ static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
return ggml_gallocr_hash_get(galloc, t)->allocated;
}
// Record in the allocator's hash table that `node` lives at `offset` inside
// buffer `buffer_id`, and mark the entry as allocated.
static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
    struct hash_node * entry = ggml_gallocr_hash_get(galloc, node);

    entry->allocated = true;
    entry->offset    = offset;
    entry->buffer_id = buffer_id;
}
// A tensor counts as allocated when it already has a data pointer, or when
// the allocator's hash entry for it is flagged as allocated.
static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
    if (t->data != NULL) {
        return true;
    }
    return ggml_gallocr_hash_get(galloc, t)->allocated;
}
static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
GGML_ASSERT(buffer_id >= 0);
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
@ -733,6 +740,7 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
if (leaf->view_src || leaf->data) {
galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@ -760,13 +768,13 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
// even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
if (new_size > cur_size || galloc->buffers[i] == NULL) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
return false;
}
ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@ -810,25 +818,21 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
}
static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
size_t node_size = 0;
if (!node->data && !node->view_src) {
GGML_ASSERT(talloc->buffer_id >= 0); // prevent segfault when misusing the API
node_size = ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
}
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(galloc->bufts[talloc->buffer_id], node);
return talloc->size_max >= node_size;
}
static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
if (galloc->n_nodes != graph->n_nodes) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
#endif
return true;
}
if (galloc->n_leafs != graph->n_leafs) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
#endif
return true;
}
@ -839,7 +843,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
#endif
return true;
}
@ -851,7 +855,7 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
}
if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
return true;
}
@ -865,14 +869,14 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
if (ggml_gallocr_needs_realloc(galloc, graph)) {
if (galloc->n_buffers == 1) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
#endif
if (!ggml_gallocr_reserve(galloc, graph)) {
return false;
}
} else {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
#endif
return false;
}
@ -936,7 +940,7 @@ static bool alloc_tensor_range(struct ggml_context * ctx,
ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
for (size_t i = 0; i < *n_buffers; i++) {
ggml_backend_buffer_free((*buffers)[i]);
@ -986,7 +990,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
}
if (this_size > max_size) {
GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
this_size, max_size);
@ -1018,7 +1022,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
if (n_buffers == 0) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
return NULL;
}

View File

@ -1,107 +0,0 @@
# Build the AMX backend only when targeting x86 with GCC newer than 11.0.
#
# The architecture test (three OR-ed alternatives) is wrapped in its own
# parentheses so that the compiler requirements visibly apply to all of them.
# CMake evaluates AND/OR left to right with equal precedence, so the ungrouped
# form behaved the same way, but it read as if the compiler check belonged
# only to the last alternative.
if ((CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM_LWR MATCHES "^(x86_64|i686|amd64|x64|win32)$" OR
     (NOT CMAKE_OSX_ARCHITECTURES AND NOT CMAKE_GENERATOR_PLATFORM_LWR AND
      CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$")) AND
    CMAKE_COMPILER_IS_GNUCC AND CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 11.0)
    message(STATUS "Using AMX")

    # every header/source in this directory, plus the public backend header
    file(GLOB   GGML_HEADERS_AMX "*.h")
    list(APPEND GGML_HEADERS_AMX "../../include/ggml-amx.h")

    file(GLOB   GGML_SOURCES_AMX "*.cpp")

    add_library(ggml-amx
                ${GGML_HEADERS_AMX}
                ${GGML_SOURCES_AMX})

    target_link_libraries(ggml-amx PRIVATE ggml-base)
    target_include_directories(ggml-amx PRIVATE . ..)

    # this is duplicated from the CPU backend, since the AMX backend also depends on the architecture flags
    # TODO: integrate AMX backend into the CPU backend
    if (MSVC)
        # instruction set detection for MSVC only
        if (GGML_NATIVE)
            # TODO: improve, should not reference files from the parent folder
            include(../ggml-cpu/cmake/FindSIMD.cmake)
        endif ()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS /arch:AVX512)

            # MSVC has no compile-time flags that enable specific
            # AVX512 extensions, nor does it define the macros
            # corresponding to those extensions.
            # Define them manually.
            if (GGML_AVX512_VBMI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
            endif()
            if (GGML_AVX512_VNNI)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
            endif()
            if (GGML_AVX512_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AVX512BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AVX512BF16__>)
            endif()
            if (GGML_AMX_TILE)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_TILE__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_TILE__>)
            endif()
            if (GGML_AMX_INT8)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_INT8__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_INT8__>)
            endif()
            if (GGML_AMX_BF16)
                add_compile_definitions($<$<COMPILE_LANGUAGE:C>:__AMX_BF16__>)
                add_compile_definitions($<$<COMPILE_LANGUAGE:CXX>:__AMX_BF16__>)
            endif()
        elseif (GGML_AVX2)
            list(APPEND ARCH_FLAGS /arch:AVX2)
        elseif (GGML_AVX)
            list(APPEND ARCH_FLAGS /arch:AVX)
        endif()
    else()
        # GCC-style flags: one -m<feature> per enabled option
        if (GGML_NATIVE)
            list(APPEND ARCH_FLAGS -march=native)
        endif()
        if (GGML_F16C)
            list(APPEND ARCH_FLAGS -mf16c)
        endif()
        if (GGML_FMA)
            list(APPEND ARCH_FLAGS -mfma)
        endif()
        if (GGML_AVX)
            list(APPEND ARCH_FLAGS -mavx)
        endif()
        if (GGML_AVX2)
            list(APPEND ARCH_FLAGS -mavx2)
        endif()
        if (GGML_AVX512)
            list(APPEND ARCH_FLAGS -mavx512f)
            list(APPEND ARCH_FLAGS -mavx512dq)
            list(APPEND ARCH_FLAGS -mavx512bw)
        endif()
        if (GGML_AVX512_VBMI)
            list(APPEND ARCH_FLAGS -mavx512vbmi)
        endif()
        if (GGML_AVX512_VNNI)
            list(APPEND ARCH_FLAGS -mavx512vnni)
        endif()
        if (GGML_AVX512_BF16)
            list(APPEND ARCH_FLAGS -mavx512bf16)
        endif()
        if (GGML_AMX_TILE)
            list(APPEND ARCH_FLAGS -mamx-tile)
        endif()
        if (GGML_AMX_INT8)
            list(APPEND ARCH_FLAGS -mamx-int8)
        endif()
        if (GGML_AMX_BF16)
            list(APPEND ARCH_FLAGS -mamx-bf16)
        endif()
    endif()

    target_compile_options(ggml-amx PRIVATE ${ARCH_FLAGS})
else()
    # tell the parent scope that the backend was not configured
    set(GGML_AMX OFF PARENT_SCOPE)
    message(WARNING "AMX requires x86 and gcc version > 11.0. Turning off GGML_AMX.")
endif()

View File

@ -1,94 +0,0 @@
#pragma once
#include "ggml.h"
// hack until AMX is moved into the CPU backend
#include "../ggml-cpu/ggml-cpu-impl.h" // <immintrin.h>
#include <algorithm>
#include <memory>
#include <type_traits>
#if defined(_OPENMP)
#include <omp.h>
#endif
#define TILE_M 16
#define TILE_N 16
#define TILE_K 32
#define VNNI_BLK 4
#define AMX_BLK_SIZE 32
#define TMM0 0
#define TMM1 1
#define TMM2 2
#define TMM3 3
#define TMM4 4
#define TMM5 5
#define TMM6 6
#define TMM7 7
// parallel routines
// Integer ceiling division: (x + y - 1) / y. Only enabled for integral T.
template <typename T, typename std::enable_if<std::is_integral<T>::value, int>::type = 0>
inline T div_up(T x, T y) { return (x + y - 1) / y; }

// Split the index range [0, n) into contiguous chunks for `nth` workers and
// return the half-open chunk [n_start, n_end) that belongs to worker `ith`.
//
// With the active (ATen-style) pattern every worker gets ceil(n / nth)
// elements except the trailing ones. Workers whose chunk would start past the
// end receive the empty range [n, n); n_start is clamped so that
// n_start <= n_end always holds and callers may safely use (n_end - n_start)
// as a length.
template <typename T>
inline void balance211(T n, T nth, T ith, T& n_start, T& n_end) {
#if 0
    // onednn partition pattern
    T& n_my = n_end;
    if (nth <= 1 || n == 0) {
        n_start = 0;
        n_my = n;
    } else {
        T n1 = div_up(n, nth);
        T n2 = n1 - 1;
        T T1 = n - n2 * nth;
        n_my = ith < T1 ? n1 : n2;
        n_start = ith <= T1 ? ith*n1 : T1 * n1 + (ith - T1) * n2;
    }
    n_end += n_start;
#else
    // pytorch aten partition pattern
    T n_my = div_up(n, nth);
    // clamp: ith * n_my can exceed n when n is not a multiple of nth
    n_start = std::min(ith * n_my, n);
    n_end = std::min(n_start + n_my, n);
#endif
}
// Run f(begin, end) over a partition of [0, n).
//
// With OpenMP, a parallel region is opened requesting `nth` threads and each
// thread calls f on the chunk balance211() assigns to it. Without OpenMP, f
// is called once on the whole range and `nth` is ignored.
template <typename func_t>
inline void parallel_for(int nth, int n, const func_t& f) {
#if defined(_OPENMP)
#pragma omp parallel num_threads(nth)
{
    // num_threads() is only a request: the runtime may start fewer threads.
    // Partition by the actual team size so that no chunk is left unprocessed.
    const int nth_actual = omp_get_num_threads();
    const int ith = omp_get_thread_num();
    int tbegin, tend;
    balance211(n, nth_actual, ith, tbegin, tend);
    f(tbegin, tend);
}
#else
    f(0, n);

    GGML_UNUSED(nth);
#endif
}
// quantized types that have AMX support
// Returns true for the quantized tensor types this backend has AMX kernels
// for. Currently only Q4_0 and Q4_1 are accepted.
inline bool qtype_has_amx_kernels(const enum ggml_type type) {
    // TODO: fix padding for vnni format
    // Disabled until then: GGML_TYPE_Q8_0, GGML_TYPE_Q4_K, GGML_TYPE_Q5_K,
    //                      GGML_TYPE_Q6_K, GGML_TYPE_IQ4_XS
    switch (type) {
        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q4_1:
            return true;
        default:
            return false;
    }
}
// ggml backend context
// Per-backend state of the AMX backend.
struct ggml_backend_amx_context {
// thread count; overwritten by ggml_backend_amx_set_n_threads()
int n_threads = GGML_DEFAULT_N_THREADS;
// presumably a scratch buffer (and its size) used by the mul_mat kernels — confirm in mmq.cpp
std::unique_ptr<char[]> work_data;
size_t work_size = 0;
};

View File

@ -1,446 +0,0 @@
#include "ggml-amx.h"
#include "ggml-amx/common.h"
#include "ggml-amx/mmq.h"
#include "ggml-backend-impl.h"
#include "ggml-impl.h"
#if defined(__gnu_linux__)
#include <sys/syscall.h>
#include <unistd.h>
#endif
#include <cstdlib>
#include <cstring>
#include <memory>
#if defined(__AMX_INT8__)
// AMX buffer interface
// Buffer interface: release the memory block stored in the buffer's context.
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    void * base = buffer->context;
    free(base);
}
// Buffer interface: the base address of the buffer is its context pointer.
static void * ggml_backend_amx_buffer_get_base(ggml_backend_buffer_t buffer) {
    void * base = (void *)(buffer->context);
    return base;
}
// Buffer interface: fill `size` bytes of the tensor's data, starting at byte
// `offset`, with `value`.
static void ggml_backend_amx_buffer_memset_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    char * dst = (char *)tensor->data + offset;
    memset(dst, value, size);
}
// Buffer interface: upload `size` bytes from `data` into the tensor.
// Types with AMX kernels go through ggml_backend_amx_convert_weight();
// every other type is copied verbatim.
static void ggml_backend_amx_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    if (!qtype_has_amx_kernels(tensor->type)) {
        memcpy((char *)tensor->data + offset, data, size);
        return;
    }

    ggml_backend_amx_convert_weight(tensor, data, offset, size);
}
// Buffer interface: read `size` bytes of tensor data, starting at byte
// `offset`, into `data`. Asserts that the tensor type is not one of the
// types with AMX kernels.
static void ggml_backend_amx_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
    GGML_UNUSED(buffer);

    GGML_ASSERT(!qtype_has_amx_kernels(tensor->type));

    const char * src = (const char *)tensor->data + offset;
    memcpy(data, src, size);
}
// Buffer interface: copy `src` into `dst`.
// Only sources that live in a host buffer are handled; for anything else the
// function returns false without touching `dst`.
static bool ggml_backend_amx_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
    GGML_UNUSED(buffer);

    if (!ggml_backend_buffer_is_host(src->buffer)) {
        return false;
    }

    if (qtype_has_amx_kernels(src->type)) {
        // types with AMX kernels are converted rather than copied
        ggml_backend_amx_convert_weight(dst, src->data, 0, ggml_backend_amx_get_alloc_size(dst));
    } else {
        memcpy(dst->data, src->data, ggml_nbytes(src));
    }
    return true;
}
// Buffer interface: set every byte of the buffer to `value`.
static void ggml_backend_amx_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
    void * const base    = buffer->context;
    const size_t n_bytes = buffer->size;
    memset(base, value, n_bytes);
}
// Callback table for AMX buffers. The initializer is positional, so the
// entries must stay in the field order of ggml_backend_buffer_i; the inline
// comments name the slot each callback fills.
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .memset_tensor = */ ggml_backend_amx_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_amx_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_amx_buffer_get_tensor,
/* .cpy_tensor = */ ggml_backend_amx_buffer_cpy_tensor,
/* .clear = */ ggml_backend_amx_buffer_clear,
/* .reset = */ NULL,
};
// Buffer type interface: fixed name of the AMX buffer type.
static const char * ggml_backend_amx_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return "AMX";
}
// Buffer type interface: allocate a TENSOR_ALIGNMENT-aligned block of at
// least `size` bytes and wrap it in an AMX buffer.
// Returns NULL (after printing a message to stderr) if the allocation fails.
static ggml_backend_buffer_t ggml_backend_amx_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    // aligned_alloc() is only specified for sizes that are an integral
    // multiple of the alignment, so round the request up. The buffer still
    // reports the size the caller asked for.
    const size_t align      = TENSOR_ALIGNMENT;
    const size_t alloc_size = (size + align - 1) / align * align;

    // alloc_size < size means the rounding wrapped around: treat as failure
    void * data = alloc_size >= size ? aligned_alloc(align, alloc_size) : NULL;
    if (data == NULL) {
        fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
        return NULL;
    }

    return ggml_backend_buffer_init(buft, ggml_backend_amx_buffer_interface, data, size);
}
// Buffer type interface: alignment of every allocation, TENSOR_ALIGNMENT.
static size_t ggml_backend_amx_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return TENSOR_ALIGNMENT;
}
// Buffer type interface: bytes needed to store `tensor` in an AMX buffer,
// as computed by ggml_backend_amx_get_alloc_size().
static size_t ggml_backend_amx_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor* tensor) {
    GGML_UNUSED(buft);

    const size_t n_bytes = ggml_backend_amx_get_alloc_size(tensor);
    return n_bytes;
}
// Buffer type interface: AMX buffers are never reported as host buffers.
static bool ggml_backend_amx_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    return false;
}
// Returns the single AMX buffer type object. It is a function-local static
// bound to device 0 of the AMX backend registry.
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
    static struct ggml_backend_buffer_type amx_buft = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_amx_buffer_type_get_name,
            /* .alloc_buffer   = */ ggml_backend_amx_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_amx_buffer_type_get_alignment,
            /* .get_max_size   = */ NULL, // defaults to SIZE_MAX
            /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
            /* .is_host        = */ ggml_backend_amx_buffer_type_is_host,
        },
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
        /* .context = */ NULL,
    };

    return &amx_buft;
}
// backend interface
// Backend interface: fixed name of the backend.
static const char * ggml_backend_amx_name(ggml_backend_t backend) {
    GGML_UNUSED(backend);

    return "AMX";
}
// Backend interface: destroy the backend's context first, then the backend
// object itself (both are created with `new` in ggml_backend_amx_init()).
static void ggml_backend_amx_free(ggml_backend_t backend) {
    delete (ggml_backend_amx_context *)backend->context;
    delete backend;
}
// Backend interface: walk the graph nodes in order.
// MUL_MAT nodes are handed to ggml_backend_amx_mul_mat(); NONE, RESHAPE,
// VIEW, PERMUTE and TRANSPOSE need no work; any other op is reported on
// stderr and trips GGML_ASSERT.
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
    ggml_backend_amx_context * amx_ctx = (ggml_backend_amx_context *)backend->context;

    for (int idx = 0; idx < cgraph->n_nodes; ++idx) {
        struct ggml_tensor * cur = cgraph->nodes[idx];
        const enum ggml_op kind = cur->op;

        if (kind == GGML_OP_MUL_MAT) {
            ggml_backend_amx_mul_mat(amx_ctx, cur);
            continue;
        }

        const bool is_noop =
            kind == GGML_OP_NONE    ||
            kind == GGML_OP_RESHAPE ||
            kind == GGML_OP_VIEW    ||
            kind == GGML_OP_PERMUTE ||
            kind == GGML_OP_TRANSPOSE;

        if (!is_noop) {
            fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(cur));
            GGML_ASSERT(false);
        }
    }

    return GGML_STATUS_SUCCESS;
}
// Callback table of the AMX backend. The initializer is positional, so the
// entries must stay in the field order of ggml_backend_i. Only get_name,
// free and graph_compute are provided; every other slot is NULL.
static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
// Returns the fixed GUID that identifies AMX backend instances
// (compared against in ggml_backend_is_amx()).
static ggml_guid_t ggml_backend_amx_guid() {
    static ggml_guid amx_guid = {
        0x13, 0xb8, 0xa4, 0xc4, 0xba, 0xfe, 0x51, 0x67,
        0x87, 0x44, 0x55, 0x15, 0xb2, 0x35, 0x62, 0x3e,
    };
    return &amx_guid;
}
#define ARCH_GET_XCOMP_PERM 0x1022
#define ARCH_REQ_XCOMP_PERM 0x1023
#define XFEATURE_XTILECFG 17
#define XFEATURE_XTILEDATA 18
// Ask the operating system for permission to use the AMX tile-data state.
//
// Linux:   issues arch_prctl(ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
//          returns false (after printing a message) if the call fails.
// Windows: no request is made; returns true.
// Other:   returns false. Previously control fell off the end of this
//          non-void function on such platforms, which is undefined behavior.
static bool ggml_amx_init() {
#if defined(__gnu_linux__)
    if (syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA)) {
        fprintf(stderr, "AMX is not ready to be used!\n");
        return false;
    }
    return true;
#elif defined(_WIN32)
    return true;
#else
    // no known way to request AMX state on this platform
    return false;
#endif
}
// Create an AMX backend instance.
//
// Returns nullptr when ggml_amx_init() reports that AMX cannot be used, the
// same value the build without AMX support returns from this function.
// Otherwise returns a heap-allocated backend that owns a fresh
// ggml_backend_amx_context; release it through the backend's `free` callback.
ggml_backend_t ggml_backend_amx_init() {
    // invoke a Linux system call to request access to AMX features;
    // without that permission the backend must not be handed out
    if (!ggml_amx_init()) {
        return nullptr;
    }

    // backend context
    ggml_backend_amx_context * ctx = new ggml_backend_amx_context;

    // ggml amx backend
    ggml_backend_t backend = new ggml_backend {
        /* .guid      = */ ggml_backend_amx_guid(),
        /* .interface = */ ggml_backend_amx_i,
        /* .device    = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
        /* .context   = */ ctx,
    };

    return backend;
}
// True when `backend` is non-NULL and carries the AMX backend GUID.
bool ggml_backend_is_amx(ggml_backend_t backend) {
    if (backend == NULL) {
        return false;
    }
    return ggml_guid_matches(backend->guid, ggml_backend_amx_guid());
}
// Store `n_threads` in the backend's context.
// Asserts that `backend_amx` really is an AMX backend.
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
    GGML_ASSERT(ggml_backend_is_amx(backend_amx));

    auto * amx_ctx = (ggml_backend_amx_context *)backend_amx->context;
    amx_ctx->n_threads = n_threads;
}
// device interface
// Device interface: fixed device name.
static const char * ggml_backend_amx_device_get_name(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return "AMX";
}
// Device interface: fixed human-readable device description.
static const char * ggml_backend_amx_device_get_description(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return "Intel Advanced Matrix Extensions";
}
// Device interface: memory query. Not implemented yet; both outputs are
// always set to 0.
static void ggml_backend_amx_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    GGML_UNUSED(dev);

    // TODO
    *total = 0;
    *free  = 0;
}
// Device interface: the AMX device is reported as an accelerator.
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
}
// Device interface: fill `props` from the individual getters of this device
// and report that none of the optional capabilities is available.
static void ggml_backend_amx_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    ggml_backend_amx_device_get_memory(dev, &props->memory_free, &props->memory_total);

    props->type        = ggml_backend_amx_device_get_type(dev);
    props->description = ggml_backend_amx_device_get_description(dev);
    props->name        = ggml_backend_amx_device_get_name(dev);

    // `buffer_from_host_ptr` is intended to be used in mmap, when memory layout unchanged
    props->caps = {
        /* .async                = */ false,
        /* .host_buffer          = */ false,
        /* .buffer_from_host_ptr = */ false,
        /* .events               = */ false,
    };
}
// Device interface: create a backend for this device. Both arguments are
// ignored; the work is delegated to ggml_backend_amx_init().
static ggml_backend_t ggml_backend_amx_device_init(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(dev);
    GGML_UNUSED(params);

    return ggml_backend_amx_init();
}
// Device interface: the device's buffer type is the AMX buffer type.
static ggml_backend_buffer_type_t ggml_backend_amx_device_get_buffer_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

    return ggml_backend_amx_buffer_type();
}
// Device interface: can this device execute `op`?
//
// NONE, RESHAPE, VIEW, PERMUTE and TRANSPOSE are always accepted.
// MUL_MAT is accepted only when all of the following hold:
//   - src0 and src1 are contiguous with ne[2] == ne[3] == 1 (2d gemm only)
//   - src1 is F32
//   - src0 is F16 or a type reported by qtype_has_amx_kernels()
//   - the first output dimension is a multiple of TILE_N * 2
// Every other op is rejected.
static bool ggml_backend_amx_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    GGML_UNUSED(dev);

    const enum ggml_op kind = op->op;

    if (kind == GGML_OP_NONE    ||
        kind == GGML_OP_RESHAPE ||
        kind == GGML_OP_VIEW    ||
        kind == GGML_OP_PERMUTE ||
        kind == GGML_OP_TRANSPOSE) {
        return true;
    }

    if (kind != GGML_OP_MUL_MAT) {
        return false;
    }

    // handle only 2d gemm for now
    auto is_contiguous_2d = [](const struct ggml_tensor * t) {
        return ggml_is_contiguous(t) && t->ne[3] == 1 && t->ne[2] == 1;
    };

    const struct ggml_tensor * weights = op->src[0];
    const struct ggml_tensor * inputs  = op->src[1];

    if (!is_contiguous_2d(weights) || !is_contiguous_2d(inputs)) {
        return false; // both operands must be contiguous
    }
    if (inputs->type != GGML_TYPE_F32) {
        return false; // src1 must be float32
    }

    const enum ggml_type wtype = weights->type;
    if (!qtype_has_amx_kernels(wtype) && wtype != GGML_TYPE_F16) {
        return false; // no amx kernel impl for this weight type
    }

    // out_features must be a multiple of 32
    const int64_t out_features = op->ne[0];
    return out_features % (TILE_N * 2) == 0;
}
// Device interface: a buffer type is supported when its get_name callback is
// the AMX buffer type's one, i.e. when it is the AMX buffer type.
static bool ggml_backend_amx_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(dev);

    const bool is_amx_buft = buft->iface.get_name == ggml_backend_amx_buffer_type_get_name;
    return is_amx_buft;
}
// Callback table of the AMX device. The initializer is positional, so the
// entries must stay in the field order of ggml_backend_device_i. Slots set
// to NULL (host buffers, host-pointer buffers, offload hint, events) are not
// provided by this device.
static const struct ggml_backend_device_i ggml_backend_amx_device_i = {
/* .get_name = */ ggml_backend_amx_device_get_name,
/* .get_description = */ ggml_backend_amx_device_get_description,
/* .get_memory = */ ggml_backend_amx_device_get_memory,
/* .get_type = */ ggml_backend_amx_device_get_type,
/* .get_props = */ ggml_backend_amx_device_get_props,
/* .init_backend = */ ggml_backend_amx_device_init,
/* .get_buffer_type = */ ggml_backend_amx_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_amx_device_supports_op,
/* .supports_buft = */ ggml_backend_amx_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// backend reg interface
// Registry interface: fixed registry name.
static const char * ggml_backend_amx_reg_get_name(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);

    return "AMX";
}
// Registry interface: the registry always exposes exactly one device.
static size_t ggml_backend_amx_reg_get_device_count(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);

    return 1;
}
// Registry interface: return the one AMX device. `index` must be 0
// (asserted). The device object is a function-local static whose `reg` field
// is taken from the first call.
static ggml_backend_dev_t ggml_backend_amx_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_ASSERT(index == 0);

    static ggml_backend_device amx_device = {
        /* .iface   = */ ggml_backend_amx_device_i,
        /* .reg     = */ reg,
        /* .context = */ nullptr,
    };

    return &amx_device;
}
// Registry interface: look up an optional entry point by name.
// Only "ggml_backend_set_n_threads" is known; any other name yields NULL.
static void * ggml_backend_amx_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);

    const bool wants_set_n_threads = std::strcmp(name, "ggml_backend_set_n_threads") == 0;
    if (!wants_set_n_threads) {
        return NULL;
    }
    return (void *)ggml_backend_amx_set_n_threads;
}
// Callback table of the AMX backend registry. The initializer is positional,
// so the entries must stay in the field order of ggml_backend_reg_i.
static const struct ggml_backend_reg_i ggml_backend_amx_reg_i = {
/* .get_name = */ ggml_backend_amx_reg_get_name,
/* .get_device_count = */ ggml_backend_amx_reg_get_device_count,
/* .get_device = */ ggml_backend_amx_reg_get_device,
/* .get_proc_address = */ ggml_backend_amx_get_proc_address,
};
// Returns the AMX backend registry object, a function-local static.
ggml_backend_reg_t ggml_backend_amx_reg(void) {
    static struct ggml_backend_reg amx_reg = {
        /* .iface   = */ ggml_backend_amx_reg_i,
        /* .context = */ NULL,
    };

    return &amx_reg;
}
#else // if defined(__AMX_INT8__)
// Stub compiled in the #else branch of the AMX feature check:
// there is no AMX buffer type, so the function reports nullptr.
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type(void) {
return nullptr;
}
// Stub for builds without AMX: no backend can be an AMX backend.
bool ggml_backend_is_amx(ggml_backend_t backend) {
    const bool is_amx = false;
    GGML_UNUSED(backend);
    return is_amx;
}
// Stub for builds without AMX: reports the missing support on stderr
// and returns nullptr instead of a backend.
ggml_backend_t ggml_backend_amx_init(void) {
    ggml_backend_t no_backend = nullptr;
    fprintf(stderr, "GGML is not compiled with AMX support!\n");
    return no_backend;
}
// Stub for builds without AMX: both arguments are ignored and the
// missing support is reported on stderr.
void ggml_backend_amx_set_n_threads(ggml_backend_t backend_amx, int n_threads) {
    GGML_UNUSED(backend_amx);
    GGML_UNUSED(n_threads);

    fprintf(stderr, "GGML is not compiled with AMX support!\n");
}
// Stub compiled in the #else branch of the AMX feature check:
// there is no AMX registry, so the function reports nullptr.
ggml_backend_reg_t ggml_backend_amx_reg(void) {
return nullptr;
}
#endif

File diff suppressed because it is too large Load Diff

View File

@ -1,17 +0,0 @@
#pragma once
#include "common.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
// AMX backend helpers. Only the declarations are visible here; the notes below
// are inferred from the signatures and must be checked against the definitions.

// NOTE(review): presumably the number of bytes to reserve for `tensor` in an AMX buffer — confirm.
size_t ggml_backend_amx_get_alloc_size(const struct ggml_tensor * tensor);
// NOTE(review): presumably stores `size` bytes from `data` into `tensor` at `offset`, converting the layout — confirm.
void ggml_backend_amx_convert_weight(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
// NOTE(review): presumably evaluates the matrix multiplication whose result tensor is `dst` — confirm.
void ggml_backend_amx_mul_mat(ggml_backend_amx_context * ctx, struct ggml_tensor * dst);
#ifdef __cplusplus
}
#endif

View File

@ -8,8 +8,6 @@
extern "C" {
#endif
#define GGML_BACKEND_API_VERSION 1
//
// Backend buffer type
//
@ -24,7 +22,7 @@ extern "C" {
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
// (optional) check if tensor data is in host memory (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
};
@ -39,6 +37,7 @@ extern "C" {
//
struct ggml_backend_buffer_i {
const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer
void (*free_buffer) (ggml_backend_buffer_t buffer);
// base address of the buffer
@ -65,20 +64,20 @@ extern "C" {
enum ggml_backend_buffer_usage usage;
};
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_t ggml_backend_buffer_init(
ggml_backend_buffer_type_t buft,
struct ggml_backend_buffer_i iface,
void * context,
size_t size);
// do not use directly, use ggml_backend_tensor_copy instead
GGML_API bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
// multi-buffer
// buffer that contains a collection of buffers
GGML_API ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
GGML_API bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
GGML_API void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
//
// Backend (stream)
@ -89,16 +88,18 @@ extern "C" {
void (*free)(ggml_backend_t backend);
// buffer allocation
ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
// (optional) complete all pending operations (required if the backend supports async operations)
// (optional) complete all pending operations
void (*synchronize)(ggml_backend_t backend);
// (optional) graph plans (not used currently)
// compute graph with a plan
// (optional) compute graph with a plan (not used currently)
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
@ -109,6 +110,21 @@ extern "C" {
// compute graph (always async if supported by the backend)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
// IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
// new backends should implement the device interface instead
// These functions are being moved to the device interface
// check if the backend can compute an operation
bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
// check if the backend can use tensors allocated in a buffer type
bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
// record an event on this stream
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
@ -168,8 +184,9 @@ extern "C" {
// check if the backend can use tensors allocated in a buffer type
bool (*supports_buft)(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft);
// (optional) check if the backend wants to run an operation, even if the weights are allocated in an incompatible buffer
// these should be expensive operations that may benefit from running on this backend instead of the CPU backend
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*offload_op)(ggml_backend_dev_t dev, const struct ggml_tensor * op);
// (optional) event synchronization
@ -201,55 +218,17 @@ extern "C" {
};
struct ggml_backend_reg {
int api_version; // initialize to GGML_BACKEND_API_VERSION
// int api_version; // TODO: for dynamic loading
struct ggml_backend_reg_i iface;
void * context;
};
// Internal backend registry API
GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Add backend dynamic loading support to the backend
// Initialize the backend
typedef ggml_backend_reg_t (*ggml_backend_init_t)(void);
// Optional: obtain a score for the backend based on the system configuration
// Higher scores are preferred, 0 means the backend is not supported in the current system
typedef int (*ggml_backend_score_t)(void);
#ifdef GGML_BACKEND_DL
# ifdef __cplusplus
# define GGML_BACKEND_DL_IMPL(reg_fn) \
extern "C" { \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
} \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
extern "C" { \
GGML_BACKEND_API int ggml_backend_score(void); \
} \
int ggml_backend_score(void) { \
return score_fn(); \
}
# else
# define GGML_BACKEND_DL_IMPL(reg_fn) \
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_init(void); \
ggml_backend_reg_t ggml_backend_init(void) { \
return reg_fn(); \
}
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn) \
GGML_BACKEND_API int ggml_backend_score(void); \
int ggml_backend_score(void) { \
return score_fn(); \
}
# endif
#else
# define GGML_BACKEND_DL_IMPL(reg_fn)
# define GGML_BACKEND_DL_SCORE_IMPL(score_fn)
#endif
void ggml_backend_register(ggml_backend_reg_t reg);
void ggml_backend_device_register(ggml_backend_dev_t device);
// TODO: backends can be loaded as a dynamic library, in which case it needs to export this function
// typedef ggml_backend_register_t * (*ggml_backend_init)(void);
#ifdef __cplusplus
}

View File

@ -1,529 +0,0 @@
#include "ggml-backend-impl.h"
#include "ggml-backend.h"
#include "ggml-impl.h"
#include <algorithm>
#include <codecvt>
#include <cstring>
#include <filesystem>
#include <locale>
#include <memory>
#include <string>
#include <type_traits>
#include <vector>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#elif defined(__APPLE__)
# include <mach-o/dyld.h>
# include <dlfcn.h>
#else
# include <dlfcn.h>
# include <unistd.h>
#endif
// Backend registry
#ifdef GGML_USE_CPU
#include "ggml-cpu.h"
#endif
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#ifdef GGML_USE_SYCL
#include "ggml-sycl.h"
#endif
#ifdef GGML_USE_VULKAN
#include "ggml-vulkan.h"
#endif
#ifdef GGML_USE_BLAS
#include "ggml-blas.h"
#endif
#ifdef GGML_USE_RPC
#include "ggml-rpc.h"
#endif
#ifdef GGML_USE_CANN
#include "ggml-cann.h"
#endif
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif
#ifdef _WIN32
using dl_handle = std::remove_pointer_t<HMODULE>;
// Deleter for Windows library handles: releases the module with FreeLibrary.
struct dl_handle_deleter {
void operator()(HMODULE handle) {
FreeLibrary(handle);
}
};
// Loads the library at `path` (wide-character path) with LoadLibraryW.
// Returns the module handle as returned by LoadLibraryW.
static dl_handle * dl_load_library(const std::wstring & path) {
    // suppress error dialogs for missing DLLs
    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);

    HMODULE lib = LoadLibraryW(path.c_str());

    // restore the error mode that was active on entry
    SetErrorMode(saved_mode);

    return lib;
}
// Narrow-string overload: converts `path` from UTF-8 to UTF-16 and forwards
// to the std::wstring overload.
static dl_handle * dl_load_library(const std::string & path) {
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utf8_to_utf16;
    const std::wstring wide_path = utf8_to_utf16.from_bytes(path);
    return dl_load_library(wide_path);
}
// Looks up the exported symbol `name` in `handle` with GetProcAddress.
// The error mode is adjusted for the duration of the lookup and then restored.
static void * dl_get_sym(dl_handle * handle, const char * name) {
    const DWORD saved_mode = SetErrorMode(SEM_FAILCRITICALERRORS);
    SetErrorMode(saved_mode | SEM_FAILCRITICALERRORS);

    void * sym = (void *) GetProcAddress(handle, name);

    SetErrorMode(saved_mode);

    return sym;
}
#else
using dl_handle = void;
// Deleter for dlopen handles: closes the library with dlclose.
struct dl_handle_deleter {
void operator()(void * handle) {
dlclose(handle);
}
};
static void * dl_load_library(const std::string & path) {
dl_handle * handle = dlopen(path.c_str(), RTLD_NOW | RTLD_LOCAL);
return handle;
}
// Looks up the symbol `name` in the library `handle` with dlsym.
static void * dl_get_sym(dl_handle * handle, const char * name) {
    void * sym = dlsym(handle, name);
    return sym;
}
#endif
using dl_handle_ptr = std::unique_ptr<dl_handle, dl_handle_deleter>;
// One registered backend: its registry object plus the handle of the dynamic
// library it was loaded from (null when registered without a library handle).
struct ggml_backend_reg_entry {
ggml_backend_reg_t reg;
dl_handle_ptr handle;
};
// Holds every registered backend and the flat list of devices they expose.
// Backends compiled into the binary are registered by the constructor; others
// can be added at runtime with load_backend() and removed with unload_backend().
struct ggml_backend_registry {
std::vector<ggml_backend_reg_entry> backends;
std::vector<ggml_backend_dev_t> devices;
// Registers each backend enabled through a GGML_USE_* macro.
// The order below is the order in which backends and their devices are
// appended to the lists; the CPU backend is registered last.
ggml_backend_registry() {
#ifdef GGML_USE_CUDA
register_backend(ggml_backend_cuda_reg());
#endif
#ifdef GGML_USE_METAL
register_backend(ggml_backend_metal_reg());
#endif
#ifdef GGML_USE_SYCL
register_backend(ggml_backend_sycl_reg());
#endif
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
#ifdef GGML_USE_CANN
register_backend(ggml_backend_cann_reg());
#endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_RPC
register_backend(ggml_backend_rpc_reg());
#endif
#ifdef GGML_USE_KOMPUTE
register_backend(ggml_backend_kompute_reg());
#endif
#ifdef GGML_USE_CPU
register_backend(ggml_backend_cpu_reg());
#endif
}
// Deliberately gives up ownership of every library handle instead of closing
// it, so the libraries stay loaded when the registry is destroyed (see FIXME).
~ggml_backend_registry() {
// FIXME: backends cannot be safely unloaded without a function to destroy all the backend resources,
// since backend threads may still be running and accessing resources from the dynamic library
for (auto & entry : backends) {
if (entry.handle) {
entry.handle.release(); // NOLINT
}
}
}
// Adds `reg` (ignored when null) to the backend list, taking ownership of
// `handle`, and registers every device the backend reports.
void register_backend(ggml_backend_reg_t reg, dl_handle_ptr handle = nullptr) {
if (!reg) {
return;
}
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered backend %s (%zu devices)\n",
__func__, ggml_backend_reg_name(reg), ggml_backend_reg_dev_count(reg));
#endif
backends.push_back({ reg, std::move(handle) });
for (size_t i = 0; i < ggml_backend_reg_dev_count(reg); i++) {
register_device(ggml_backend_reg_dev_get(reg, i));
}
}
// Appends `device` to the device list.
void register_device(ggml_backend_dev_t device) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: registered device %s (%s)\n", __func__, ggml_backend_dev_name(device), ggml_backend_dev_description(device));
#endif
devices.push_back(device);
}
// Loads the dynamic library at `path` and registers the backend it exports.
// Returns the backend registry object, or nullptr when the library cannot be
// loaded, reports a score of 0, lacks ggml_backend_init, returns a null
// registry, or was built for a different GGML_BACKEND_API_VERSION.
// `silent` suppresses the failure messages; the success message is always logged.
// On every failure path `handle` goes out of scope, which runs dl_handle_deleter.
ggml_backend_reg_t load_backend(const char * path, bool silent) {
dl_handle_ptr handle { dl_load_library(path) };
if (!handle) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path);
}
return nullptr;
}
// the score function is optional; a score of 0 rejects the backend
auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
if (score_fn && score_fn() == 0) {
if (!silent) {
GGML_LOG_INFO("%s: backend %s is not supported on this system\n", __func__, path);
}
return nullptr;
}
// the init function is mandatory
auto backend_init_fn = (ggml_backend_init_t) dl_get_sym(handle.get(), "ggml_backend_init");
if (!backend_init_fn) {
if (!silent) {
GGML_LOG_ERROR("%s: failed to find ggml_backend_init in %s\n", __func__, path);
}
return nullptr;
}
ggml_backend_reg_t reg = backend_init_fn();
if (!reg || reg->api_version != GGML_BACKEND_API_VERSION) {
if (!silent) {
if (!reg) {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: ggml_backend_init returned NULL\n", __func__, path);
} else {
GGML_LOG_ERROR("%s: failed to initialize backend from %s: incompatible API version (backend: %d, current: %d)\n",
__func__, path, reg->api_version, GGML_BACKEND_API_VERSION);
}
}
return nullptr;
}
GGML_LOG_INFO("%s: loaded %s backend from %s\n", __func__, ggml_backend_reg_name(reg), path);
// ownership of the library handle moves into the registry entry
register_backend(reg, std::move(handle));
return reg;
}
// Removes `reg` and all devices that belong to it from the registry.
// Does nothing (apart from an optional error message) when `reg` is not registered.
void unload_backend(ggml_backend_reg_t reg, bool silent) {
auto it = std::find_if(backends.begin(), backends.end(),
[reg](const ggml_backend_reg_entry & entry) { return entry.reg == reg; });
if (it == backends.end()) {
if (!silent) {
GGML_LOG_ERROR("%s: backend not found\n", __func__);
}
return;
}
if (!silent) {
GGML_LOG_DEBUG("%s: unloading %s backend\n", __func__, ggml_backend_reg_name(reg));
}
// remove devices
devices.erase(
std::remove_if(devices.begin(), devices.end(),
[reg](ggml_backend_dev_t dev) { return ggml_backend_dev_backend_reg(dev) == reg; }),
devices.end());
// remove backend
// erasing the entry destroys its dl_handle_ptr, which runs dl_handle_deleter on a non-null handle
backends.erase(it);
}
};
// Returns the process-wide backend registry.
// The registry is a function-local static, constructed on the first call.
static ggml_backend_registry & get_reg() {
    static ggml_backend_registry registry;
    return registry;
}
// Internal API
// Adds `reg` to the global backend registry.
void ggml_backend_register(ggml_backend_reg_t reg) {
    ggml_backend_registry & registry = get_reg();
    registry.register_backend(reg);
}
// Adds `device` to the global device list.
void ggml_backend_device_register(ggml_backend_dev_t device) {
    ggml_backend_registry & registry = get_reg();
    registry.register_device(device);
}
// Backend (reg) enumeration
// Case-insensitive equality of two NUL-terminated strings.
// Returns true when both strings have the same length and every pair of
// characters compares equal after std::tolower.
static bool striequals(const char * a, const char * b) {
    for (; *a && *b; a++, b++) {
        // std::tolower requires a value representable as unsigned char (or EOF);
        // passing a negative plain char is undefined behaviour, so convert first
        const int lower_a = std::tolower(static_cast<unsigned char>(*a));
        const int lower_b = std::tolower(static_cast<unsigned char>(*b));
        if (lower_a != lower_b) {
            return false;
        }
    }
    // equal only if both strings ended at the same position
    return *a == *b;
}
size_t ggml_backend_reg_count() {
return get_reg().backends.size();
}
// Returns the backend at position `index`; asserts that the index is in range.
ggml_backend_reg_t ggml_backend_reg_get(size_t index) {
    GGML_ASSERT(index < ggml_backend_reg_count());

    const ggml_backend_reg_entry & entry = get_reg().backends[index];
    return entry.reg;
}
// Finds the first registered backend whose name equals `name`, ignoring case.
// Returns nullptr when no backend matches.
ggml_backend_reg_t ggml_backend_reg_by_name(const char * name) {
    size_t idx = 0;
    while (idx < ggml_backend_reg_count()) {
        ggml_backend_reg_t candidate = ggml_backend_reg_get(idx);
        if (striequals(ggml_backend_reg_name(candidate), name)) {
            return candidate;
        }
        ++idx;
    }
    return nullptr;
}
// Device enumeration
size_t ggml_backend_dev_count() {
return get_reg().devices.size();
}
// Returns the device at position `index`; asserts that the index is in range.
ggml_backend_dev_t ggml_backend_dev_get(size_t index) {
    GGML_ASSERT(index < ggml_backend_dev_count());

    ggml_backend_dev_t dev = get_reg().devices[index];
    return dev;
}
// Finds the first registered device whose name equals `name`, ignoring case.
// Returns nullptr when no device matches.
ggml_backend_dev_t ggml_backend_dev_by_name(const char * name) {
    size_t idx = 0;
    while (idx < ggml_backend_dev_count()) {
        ggml_backend_dev_t candidate = ggml_backend_dev_get(idx);
        if (striequals(ggml_backend_dev_name(candidate), name)) {
            return candidate;
        }
        ++idx;
    }
    return nullptr;
}
// Finds the first registered device of the given type.
// Returns nullptr when no device has that type.
ggml_backend_dev_t ggml_backend_dev_by_type(enum ggml_backend_dev_type type) {
    size_t idx = 0;
    while (idx < ggml_backend_dev_count()) {
        ggml_backend_dev_t candidate = ggml_backend_dev_get(idx);
        if (ggml_backend_dev_type(candidate) == type) {
            return candidate;
        }
        ++idx;
    }
    return nullptr;
}
// Convenience functions
// Initializes a backend on the device named `name`, passing `params` through.
// Returns nullptr when no such device is registered.
ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params) {
    ggml_backend_dev_t found = ggml_backend_dev_by_name(name);
    if (found) {
        return ggml_backend_dev_init(found, params);
    }
    return nullptr;
}
// Initializes a backend on the first device of type `type`, passing `params` through.
// Returns nullptr when no such device is registered.
ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params) {
    ggml_backend_dev_t found = ggml_backend_dev_by_type(type);
    if (found) {
        return ggml_backend_dev_init(found, params);
    }
    return nullptr;
}
// Initializes a backend on the preferred available device: the first GPU
// device if one is registered, otherwise the first CPU device.
// Returns nullptr when neither kind of device is registered.
ggml_backend_t ggml_backend_init_best(void) {
    // device types in order of preference
    const enum ggml_backend_dev_type preference[] = {
        GGML_BACKEND_DEVICE_TYPE_GPU,
        GGML_BACKEND_DEVICE_TYPE_CPU,
    };

    for (enum ggml_backend_dev_type wanted : preference) {
        ggml_backend_dev_t candidate = ggml_backend_dev_by_type(wanted);
        if (candidate) {
            return ggml_backend_dev_init(candidate, nullptr);
        }
    }

    return nullptr;
}
// Dynamic loading
// Loads and registers the backend library at `path`; failures are logged.
ggml_backend_reg_t ggml_backend_load(const char * path) {
    const bool silent = false;
    return get_reg().load_backend(path, silent);
}
// Removes `reg` from the global registry without logging.
void ggml_backend_unload(ggml_backend_reg_t reg) {
    const bool silent = true;
    get_reg().unload_backend(reg, silent);
}
// Returns the directory that contains the running executable, followed by the
// platform path separator.
//   - macOS:   queried with _NSGetExecutablePath
//   - Linux:   read from /proc/self/exe; "./" if the link cannot be read
//   - Windows: queried with GetModuleFileNameA; "" if the call fails
//   - other:   "" (no lookup is implemented)
static std::string get_executable_path() {
#if defined(__APPLE__)
    // get executable path
    std::vector<char> path;
    uint32_t size;
    while (true) {
        size = path.size();
        // a non-zero result means the buffer is too small; `size` then holds the required size
        if (_NSGetExecutablePath(path.data(), &size) == 0) {
            break;
        }
        path.resize(size);
    }
    std::string base_path(path.data(), size);
    // remove executable name
    auto last_slash = base_path.find_last_of('/');
    if (last_slash != std::string::npos) {
        base_path = base_path.substr(0, last_slash);
    }
    return base_path + "/";
#elif defined(__linux__)
    std::string base_path = ".";
    std::vector<char> path(1024);
    while (true) {
        // get executable path
        ssize_t len = readlink("/proc/self/exe", path.data(), path.size());
        if (len == -1) {
            break;
        }
        // readlink does not report truncation: only a result shorter than the
        // buffer is known to be complete, otherwise retry with a larger buffer
        if (len < (ssize_t) path.size()) {
            base_path = std::string(path.data(), len);
            // remove executable name
            auto last_slash = base_path.find_last_of('/');
            if (last_slash != std::string::npos) {
                base_path = base_path.substr(0, last_slash);
            }
            break;
        }
        path.resize(path.size() * 2);
    }
    return base_path + "/";
#elif defined(_WIN32)
    std::vector<char> path(MAX_PATH);
    DWORD len = GetModuleFileNameA(NULL, path.data(), path.size());
    if (len == 0) {
        return "";
    }
    std::string base_path(path.data(), len);
    // remove executable name
    auto last_slash = base_path.find_last_of('\\');
    if (last_slash != std::string::npos) {
        base_path = base_path.substr(0, last_slash);
    }
    return base_path + "\\";
#else
    // No lookup is implemented for this platform. Returning a value here is
    // required: flowing off the end of a non-void function is undefined behaviour.
    return "";
#endif
}
// File-name prefix of backend libraries: "ggml-" on Windows, "libggml-" elsewhere.
static std::string backend_filename_prefix() {
#ifdef _WIN32
    const char * prefix = "ggml-";
#else
    const char * prefix = "libggml-";
#endif
    return prefix;
}
// File-name extension of backend libraries: ".dll" on Windows, ".so" elsewhere.
static std::string backend_filename_suffix() {
#ifdef _WIN32
    const char * suffix = ".dll";
#else
    const char * suffix = ".so";
#endif
    return suffix;
}
// Loads the best available variant of the backend called `name`.
//
// Every search path is scanned for files named [lib]ggml-<name>-*.[so|dll].
// Each candidate is opened, asked for its score through the optional
// "ggml_backend_score" symbol, and closed again when its handle goes out of
// scope. The candidate with the highest positive score is then loaded for
// real through the registry. If no candidate scores above 0, the unsuffixed
// library [lib]ggml-<name>.[so|dll] is tried in each search path.
//
// `silent` suppresses the per-file diagnostics.
// Returns the registered backend, or nullptr when nothing could be loaded.
static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent) {
    namespace fs = std::filesystem;

    // enumerate all the files that match [lib]ggml-name-*.[so|dll] in the search paths
    // TODO: search system paths
    std::vector<std::string> search_paths = { "./", get_executable_path() };
    std::string file_prefix = backend_filename_prefix() + name + "-";

    int best_score = 0;
    std::string best_path;

    for (const auto & search_path : search_paths) {
        if (!fs::exists(search_path)) {
            continue;
        }
        // skip_permission_denied: a directory that cannot be read yields an empty
        // range instead of a filesystem_error that would abort backend discovery
        for (const auto & entry : fs::directory_iterator(search_path, fs::directory_options::skip_permission_denied)) {
            if (!entry.is_regular_file()) {
                continue;
            }
            std::string filename = entry.path().filename().string();
            std::string ext = entry.path().extension().string();
            if (filename.find(file_prefix) != 0 || ext != backend_filename_suffix()) {
                continue;
            }

            // temporary handle, used only to query the score
            dl_handle_ptr handle { dl_load_library(entry.path().c_str()) };
            if (!handle) {
                if (!silent) {
                    GGML_LOG_ERROR("%s: failed to load %s\n", __func__, entry.path().string().c_str());
                }
                continue;
            }

            auto score_fn = (ggml_backend_score_t) dl_get_sym(handle.get(), "ggml_backend_score");
            if (!score_fn) {
                if (!silent) {
                    GGML_LOG_INFO("%s: failed to find ggml_backend_score in %s\n", __func__, entry.path().string().c_str());
                }
                continue;
            }

            int s = score_fn();
#ifndef NDEBUG
            GGML_LOG_DEBUG("%s: %s score: %d\n", __func__, entry.path().string().c_str(), s);
#endif
            if (s > best_score) {
                best_score = s;
                best_path = entry.path().string();
            }
        }
    }

    if (best_score == 0) {
        // try to load the base backend
        for (const auto & search_path : search_paths) {
            std::string path = search_path + backend_filename_prefix() + name + backend_filename_suffix();
            if (fs::exists(path)) {
                return get_reg().load_backend(path.c_str(), silent);
            }
        }
        return nullptr;
    }

    return get_reg().load_backend(best_path.c_str(), silent);
}
// Tries to load the best variant of every known backend, in a fixed order.
// Diagnostics are suppressed in NDEBUG builds.
void ggml_backend_load_all() {
#ifdef NDEBUG
    const bool silent = true;
#else
    const bool silent = false;
#endif

    // load order is significant: it determines the registration order
    static const char * const backend_names[] = {
        "blas",
        "cann",
        "cuda",
        "hip",
        "kompute",
        "metal",
        "rpc",
        "sycl",
        "vulkan",
        "musa",
        "cpu",
    };

    for (const char * backend_name : backend_names) {
        ggml_backend_load_best(backend_name, silent);
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -4,9 +4,8 @@
#include <future>
#include <vector>
#include <cstring>
#if defined(GGML_BLAS_USE_ACCELERATE)
#if defined(GGML_USE_ACCELERATE)
# include <Accelerate/Accelerate.h>
#elif defined(GGML_BLAS_USE_MKL)
# include <mkl.h>
@ -27,6 +26,30 @@ struct ggml_backend_blas_context {
#endif
};
// helper function to determine if it is better to use BLAS or not
// for large matrices, BLAS is faster
// Decides whether the matrix multiplication that produces `dst` should go
// through BLAS: both sources must be contiguous, src1 must be F32, and
// dst->ne[0], dst->ne[1] and src1->ne[0] must each be at least 32.
static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
    const struct ggml_tensor * a = dst->src[0];
    const struct ggml_tensor * b = dst->src[1];

    if (!ggml_is_contiguous(a)) {
        return false;
    }
    if (!ggml_is_contiguous(b)) {
        return false;
    }
    if (b->type != GGML_TYPE_F32) {
        return false;
    }

    // TODO: find the optimal values for these
    const int64_t rows  = dst->ne[0];
    const int64_t cols  = dst->ne[1];
    const int64_t inner = b->ne[0];

    return rows >= 32 && cols >= 32 && inner >= 32;
}
static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
@ -65,8 +88,8 @@ static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct gg
// convert src0 to float
if (type != GGML_TYPE_F32) {
const auto * type_traits = ggml_get_type_traits(type);
ggml_to_float_t const to_float = type_traits->to_float;
ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
ggml_to_float_t const to_float = type_traits.to_float;
for (int64_t i03 = 0; i03 < ne03; i03++) {
for (int64_t i02 = 0; i02 < ne02; i02++) {
@ -212,7 +235,7 @@ static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct g
// backend interface
static const char * ggml_backend_blas_get_name(ggml_backend_t backend) {
static const char * ggml_backend_blas_name(ggml_backend_t backend) {
return "BLAS";
GGML_UNUSED(backend);
@ -224,6 +247,12 @@ static void ggml_backend_blas_free(ggml_backend_t backend) {
delete backend;
}
// The BLAS backend uses the CPU buffer type as its default buffer type.
static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
    GGML_UNUSED(backend);
    return ggml_backend_cpu_buffer_type();
}
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
@ -256,9 +285,31 @@ static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend,
GGML_UNUSED(backend);
}
// Reports whether the BLAS backend can compute `op`.
// MUL_MAT is accepted when ggml_backend_blas_use_blas() agrees; OUT_PROD is
// accepted for F32 matrix operands with a contiguous src0 and a contiguous or
// transposed src1. Every other operation is rejected.
static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
    GGML_UNUSED(backend);

    switch (op->op) {
        case GGML_OP_MUL_MAT:
            return ggml_backend_blas_use_blas(op);

        case GGML_OP_OUT_PROD: {
            const struct ggml_tensor * a = op->src[0];
            const struct ggml_tensor * b = op->src[1];
            return a->type == GGML_TYPE_F32 &&
                   b->type == GGML_TYPE_F32 &&
                   ggml_is_matrix(a) &&
                   ggml_is_matrix(b) &&
                   ggml_is_contiguous(a) &&
                   (ggml_is_contiguous(b) || ggml_is_transposed(b));
        }

        default:
            return false;
    }
}
// The BLAS backend accepts any buffer type that ggml_backend_buft_is_host() reports as host memory.
static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(backend);
    const bool is_host = ggml_backend_buft_is_host(buft);
    return is_host;
}
static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .get_name = */ ggml_backend_blas_name,
/* .free = */ ggml_backend_blas_free,
/* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
@ -268,6 +319,9 @@ static struct ggml_backend_i blas_backend_i = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_blas_graph_compute,
/* .supports_op = */ ggml_backend_blas_supports_op,
/* .supports_buft = */ ggml_backend_blas_supports_buft,
/* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
@ -283,18 +337,18 @@ ggml_backend_t ggml_backend_blas_init(void) {
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_blas_guid(),
/* .interface = */ blas_backend_i,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_blas_reg(), 0),
/* .device = */ nullptr,
/* .context = */ ctx,
};
#if defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
if (openblas_get_parallel() != OPENBLAS_OPENMP) {
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
}
#endif
#if defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
GGML_LOG_DEBUG("%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
#endif
return backend;
@ -310,208 +364,3 @@ void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads)
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
ctx->n_threads = n_threads;
}
// device interface
// Device callback: the BLAS device is always named "BLAS".
static const char * ggml_backend_blas_device_get_name(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);
    return "BLAS";
}
// Device callback: names the BLAS implementation selected at compile time,
// falling back to the generic "BLAS" when none of the known macros is defined.
static const char * ggml_backend_blas_device_get_description(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);

#if defined(GGML_BLAS_USE_ACCELERATE)
    const char * vendor = "Accelerate";
#elif defined(GGML_BLAS_USE_MKL)
    const char * vendor = "MKL";
#elif defined(GGML_BLAS_USE_BLIS)
    const char * vendor = "BLIS";
#elif defined(GGML_BLAS_USE_NVPL)
    const char * vendor = "NVPL";
#elif defined(OPENBLAS_VERSION)
    const char * vendor = "OpenBLAS";
#else
    const char * vendor = "BLAS";
#endif

    return vendor;
}
// Device callback: memory reporting is not implemented; both values are set to 0.
static void ggml_backend_blas_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    GGML_UNUSED(dev);

    // TODO
    const size_t unknown = 0;
    *free  = unknown;
    *total = unknown;
}
// Device callback: the BLAS device is reported as an accelerator.
static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);
    return GGML_BACKEND_DEVICE_TYPE_ACCEL;
}
// Device callback: fills `props` from the other device getters and sets the
// capability flags (only buffer_from_host_ptr is enabled).
static void ggml_backend_blas_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
    props->caps = {
        /* .async                 = */ false,
        /* .host_buffer           = */ false,
        /* .buffer_from_host_ptr  = */ true,
        /* .events                = */ false,
    };

    props->type        = ggml_backend_blas_device_get_type(dev);
    props->description = ggml_backend_blas_device_get_description(dev);
    props->name        = ggml_backend_blas_device_get_name(dev);

    ggml_backend_blas_device_get_memory(dev, &props->memory_free, &props->memory_total);
}
// Device callback: creates a BLAS backend; `dev` and `params` are ignored.
static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(dev);
    GGML_UNUSED(params);

    ggml_backend_t backend = ggml_backend_blas_init();
    return backend;
}
// Device callback: the BLAS device allocates from the CPU buffer type.
static ggml_backend_buffer_type_t ggml_backend_blas_device_get_buffer_type(ggml_backend_dev_t dev) {
    GGML_UNUSED(dev);
    return ggml_backend_cpu_buffer_type();
}
// Device callback: wraps the host memory [ptr, ptr + size) in a CPU buffer.
// `dev` and `max_tensor_size` are ignored.
static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
    GGML_UNUSED(dev);
    GGML_UNUSED(max_tensor_size);

    ggml_backend_buffer_t wrapped = ggml_backend_cpu_buffer_from_ptr(ptr, size);
    return wrapped;
}
// Device callback: reports whether the BLAS device can execute `op`.
//   - NONE / RESHAPE / VIEW / PERMUTE / TRANSPOSE: always supported
//   - MUL_MAT:  contiguous sources, F32 src1, every relevant dimension at least
//               min_batch, and src0 either F32 or convertible through to_float
//   - OUT_PROD: F32 matrix operands, contiguous src0, contiguous or transposed src1
//   - anything else: not supported
static bool ggml_backend_blas_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
    GGML_UNUSED(dev);

    const struct ggml_tensor * src0 = op->src[0];
    const struct ggml_tensor * src1 = op->src[1];

    switch (op->op) {
        case GGML_OP_NONE:
        case GGML_OP_RESHAPE:
        case GGML_OP_VIEW:
        case GGML_OP_PERMUTE:
        case GGML_OP_TRANSPOSE:
            return true;

        case GGML_OP_MUL_MAT:
            {
                // BLAS usually is only faster for large matrices
                // TODO: find the optimal value
                const int64_t min_batch = 32;

                if (!ggml_is_contiguous(src0) || !ggml_is_contiguous(src1)) {
                    return false;
                }
                if (src1->type != GGML_TYPE_F32) {
                    return false;
                }
                if (op->ne[0] < min_batch || op->ne[1] < min_batch || src1->ne[0] < min_batch) {
                    return false;
                }
                return src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL;
            }

        case GGML_OP_OUT_PROD:
            {
                if (src0->type != GGML_TYPE_F32 || src1->type != GGML_TYPE_F32) {
                    return false;
                }
                if (!ggml_is_matrix(src0) || !ggml_is_matrix(src1)) {
                    return false;
                }
                if (!ggml_is_contiguous(src0)) {
                    return false;
                }
                if (!ggml_is_contiguous(src1) && !ggml_is_transposed(src1)) {
                    return false;
                }
                return src0->type == GGML_TYPE_F32 || ggml_get_type_traits(src0->type)->to_float != NULL;
            }

        default:
            return false;
    }
}
// Device callback: accepts any buffer type that ggml_backend_buft_is_host() reports as host memory.
static bool ggml_backend_blas_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(dev);
    const bool is_host = ggml_backend_buft_is_host(buft);
    return is_host;
}
// Device interface table for the BLAS backend.
// The members are initialized positionally, in the order named by the inline
// comments, so the order of the entries must not be changed.
// Slots set to NULL (host buffer type, offload_op and the three event hooks)
// have no BLAS implementation in this table.
static const struct ggml_backend_device_i ggml_backend_blas_device_i = {
/* .get_name = */ ggml_backend_blas_device_get_name,
/* .get_description = */ ggml_backend_blas_device_get_description,
/* .get_memory = */ ggml_backend_blas_device_get_memory,
/* .get_type = */ ggml_backend_blas_device_get_type,
/* .get_props = */ ggml_backend_blas_device_get_props,
/* .init_backend = */ ggml_backend_blas_device_init_backend,
/* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
/* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_blas_device_supports_op,
/* .supports_buft = */ ggml_backend_blas_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
/* .event_synchronize = */ NULL,
};
// backend reg interface
// Registry callback: the BLAS backend always identifies itself as "BLAS".
static const char * ggml_backend_blas_reg_get_name(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);
    return "BLAS";
}
// Registry callback: the BLAS backend exposes exactly one device.
static size_t ggml_backend_blas_reg_get_device_count(ggml_backend_reg_t reg) {
    GGML_UNUSED(reg);
    const size_t n_devices = 1;
    return n_devices;
}
// Registry callback: returns the single BLAS device. `index` must be 0.
static ggml_backend_dev_t ggml_backend_blas_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    GGML_UNUSED(reg);
    GGML_UNUSED(index);

    GGML_ASSERT(index == 0);

    // function-local static: initialized on the first call, reused afterwards
    static ggml_backend_device blas_dev = {
        /* .iface   = */ ggml_backend_blas_device_i,
        /* .reg     = */ reg,
        /* .context = */ nullptr,
    };

    return &blas_dev;
}
// Registry callback: resolves an optional entry point by name.
// Only "ggml_backend_set_n_threads" is known; every other name yields NULL.
static void * ggml_backend_blas_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(reg);
    GGML_UNUSED(name);

    if (std::strcmp(name, "ggml_backend_set_n_threads") != 0) {
        return NULL;
    }
    return (void *)ggml_backend_blas_set_n_threads;
}
// Registry interface table for the BLAS backend.
// Initialized positionally in the order named by the inline comments.
static const struct ggml_backend_reg_i ggml_backend_blas_reg_i = {
/* .get_name = */ ggml_backend_blas_reg_get_name,
/* .get_device_count = */ ggml_backend_blas_reg_get_device_count,
/* .get_device = */ ggml_backend_blas_reg_get_device,
/* .get_proc_address = */ ggml_backend_blas_get_proc_address,
};
// Returns the BLAS backend registry object.
// The object is a function-local static, so every call returns the same address.
ggml_backend_reg_t ggml_backend_blas_reg(void) {
    static struct ggml_backend_reg blas_registry = {
        /* .api_version = */ GGML_BACKEND_API_VERSION,
        /* .iface       = */ ggml_backend_blas_reg_i,
        /* .context     = */ NULL,
    };

    return &blas_registry;
}
GGML_BACKEND_DL_IMPL(ggml_backend_blas_reg)

View File

@ -1,87 +0,0 @@
# Build rules for the ggml BLAS backend library (ggml-blas).
#
# A BLAS implementation is located with CMake's FindBLAS module, using
# GGML_BLAS_VENDOR as the vendor selector. FindBLAS does not report an include
# directory, so when BLAS_INCLUDE_DIRS is empty it is resolved through
# pkg-config or, failing that, by searching well-known locations for cblas.h.
# Configuration stops with a fatal error when no BLAS library is found.
if (GGML_STATIC)
    set(BLA_STATIC ON)
endif()
#if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
#    set(BLA_SIZEOF_INTEGER 8)
#endif()

set(BLA_VENDOR ${GGML_BLAS_VENDOR})
find_package(BLAS)

if (BLAS_FOUND)
    message(STATUS "BLAS found, Libraries: ${BLAS_LIBRARIES}")

    ggml_add_backend_library(ggml-blas
                             ggml-blas.cpp
                            )

    # NOTE: variables are quoted inside if() so that an empty or list-valued
    # value cannot turn the condition into a malformed expression.
    if ("${GGML_BLAS_VENDOR}" MATCHES "Apple")
        add_compile_definitions(ACCELERATE_NEW_LAPACK)
        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
        add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
    elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
        # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
        # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
        find_package(PkgConfig REQUIRED)
        if ("${GGML_BLAS_VENDOR}" MATCHES "Generic")
            pkg_check_modules(DepBLAS blas)
        elseif ("${GGML_BLAS_VENDOR}" MATCHES "OpenBLAS")
            # As of openblas v0.3.22, the 64-bit is named openblas64.pc
            pkg_check_modules(DepBLAS openblas64)
            if (NOT DepBLAS_FOUND)
                pkg_check_modules(DepBLAS openblas)
            endif()
        elseif ("${GGML_BLAS_VENDOR}" MATCHES "FLAME")
            add_compile_definitions(GGML_BLAS_USE_BLIS)
            pkg_check_modules(DepBLAS blis)
        elseif ("${GGML_BLAS_VENDOR}" MATCHES "ATLAS")
            pkg_check_modules(DepBLAS blas-atlas)
        elseif ("${GGML_BLAS_VENDOR}" MATCHES "FlexiBLAS")
            pkg_check_modules(DepBLAS flexiblas_api)
        elseif ("${GGML_BLAS_VENDOR}" MATCHES "Intel")
            add_compile_definitions(GGML_BLAS_USE_MKL)
            # all Intel* libraries share the same include path
            pkg_check_modules(DepBLAS mkl-sdl)
        elseif ("${GGML_BLAS_VENDOR}" MATCHES "NVHPC")
            # this doesn't provide pkg-config
            # suggest to assign BLAS_INCLUDE_DIRS on your own
            if ("${NVHPC_VERSION}" STREQUAL "")
                message(WARNING "Better to set NVHPC_VERSION")
            else()
                set(DepBLAS_FOUND ON)
                set(DepBLAS_INCLUDE_DIRS "/opt/nvidia/hpc_sdk/${CMAKE_SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR}/${NVHPC_VERSION}/math_libs/include")
            endif()
        endif()
        if (DepBLAS_FOUND)
            set(BLAS_INCLUDE_DIRS ${DepBLAS_INCLUDE_DIRS})
        else()
            # last resort: look for cblas.h in common install locations
            message(WARNING "BLAS_INCLUDE_DIRS neither been provided nor been automatically"
            " detected by pkgconfig, trying to find cblas.h from possible paths...")
            find_path(BLAS_INCLUDE_DIRS
                NAMES cblas.h
                HINTS
                    /usr/include
                    /usr/local/include
                    /usr/include/openblas
                    /opt/homebrew/opt/openblas/include
                    /usr/local/opt/openblas/include
                    /usr/include/x86_64-linux-gnu/openblas/include
            )
        endif()
    endif()

    message(STATUS "BLAS found, Includes: ${BLAS_INCLUDE_DIRS}")

    target_compile_options(ggml-blas PRIVATE ${BLAS_LINKER_FLAGS})

    # an MKL include path combined with the Generic or Intel vendor implies MKL
    if ("${BLAS_INCLUDE_DIRS}" MATCHES "mkl" AND ("${GGML_BLAS_VENDOR}" MATCHES "Generic" OR "${GGML_BLAS_VENDOR}" MATCHES "Intel"))
        add_compile_definitions(GGML_BLAS_USE_MKL)
    endif()

    target_link_libraries     (ggml-blas PRIVATE ${BLAS_LIBRARIES})
    target_include_directories(ggml-blas PRIVATE ${BLAS_INCLUDE_DIRS})
else()
    # "ERROR" is not a message() mode keyword: it would be printed as part of
    # an ordinary notice and configuration would carry on without the backend.
    # FATAL_ERROR stops processing so the misconfiguration cannot go unnoticed.
    message(FATAL_ERROR "BLAS not found, please refer to "
                        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
                        " to set correct GGML_BLAS_VENDOR")
endif()

View File

@ -39,8 +39,6 @@
#include "ggml-common.h"
#define GGML_CANN_NAME "CANN"
/**
* @brief Handles CANN errors by printing an error message and aborting.
*
@ -122,10 +120,6 @@ static ggml_cann_device_info ggml_cann_init() {
ACL_CHECK(aclrtMemGetAllocationGranularity(
&prop, ACL_RT_MEM_ALLOC_GRANULARITY_RECOMMENDED,
&info.devices[id].vmm_granularity));
size_t free, total;
ggml_backend_cann_get_device_memory(id, &free, &total);
info.devices[id].total_vram = free;
}
// TODO: add more device info later.
@ -212,11 +206,6 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
* @return A pointer to the allocated buffer.
*/
void* alloc(size_t size, size_t* actual_size) override {
const size_t alignment = 128;
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
#ifdef DEBUG_CANN_MALLOC
int nnz = 0;
size_t max_size = 0;
@ -255,11 +244,13 @@ struct ggml_cann_pool_leg : public ggml_cann_pool {
return ptr;
}
void* ptr;
size_t look_ahead_size = (size_t)(1.05 * size);
look_ahead_size = 256 * ((look_ahead_size + 255) / 256);
ggml_cann_set_device(device);
ACL_CHECK(
aclrtMalloc(&ptr, size, ACL_MEM_MALLOC_HUGE_FIRST));
*actual_size = size;
pool_size += size;
aclrtMalloc(&ptr, look_ahead_size, ACL_MEM_MALLOC_HUGE_FIRST));
*actual_size = look_ahead_size;
pool_size += look_ahead_size;
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO(
"%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, "
@ -303,7 +294,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
/**
* @brief The maximum size of the virtual memory pool (32 GB).
*/
size_t max_size;
static const size_t CANN_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
/**
* @brief The device ID associated with this buffer pool.
@ -348,11 +339,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
*/
explicit ggml_cann_pool_vmm(int device)
: device(device),
granularity(ggml_cann_info().devices[device].vmm_granularity) {
auto dev = ggml_cann_info().devices[device];
granularity = dev.vmm_granularity;
max_size = dev.total_vram;
}
granularity(ggml_cann_info().devices[device].vmm_granularity) {}
/**
* @brief Destructor to free all buffers in the virtual memory pool.
@ -381,19 +368,17 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
// round up the allocation size to the alignment to ensure that all
// allocations are aligned for all data types
const size_t alignment = 128;
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
size = alignment * ((size + alignment - 1) / alignment);
size_t avail = pool_size - pool_used;
if (size > avail) {
// round up to the next multiple of the granularity
size_t reserve_size = size - avail;
reserve_size = GGML_PAD(reserve_size, granularity);
reserve_size =
granularity * ((reserve_size + granularity - 1) / granularity);
GGML_ASSERT(pool_size + reserve_size <= max_size);
GGML_ASSERT(pool_size + reserve_size <= CANN_POOL_VMM_MAX_SIZE);
// allocate more physical memory
aclrtPhysicalMemProp prop = {};
@ -409,7 +394,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
// reserve virtual address space (if not already reserved)
if (pool_addr == 0) {
ACL_CHECK(aclrtReserveMemAddress(
&pool_addr, max_size, 0, NULL, 1));
&pool_addr, CANN_POOL_VMM_MAX_SIZE, 0, NULL, 1));
}
// map at the end of the pool
@ -422,11 +407,10 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
// add to the pool
pool_size += reserve_size;
#ifdef DEBUG_CANN_MALLOC
GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (reserved %llu MB)\n",
device, (unsigned long long) (pool_size/1024/1024),
(unsigned long long) (reserve_size/1024/1024));
#endif
// GGML_LOG_INFO("cann pool[%d]: size increased to %llu MB (
// reserved %llu MB)\n",
// device, (unsigned long long) (pool_size/1024/1024),
// (unsigned long long) (reserve_size/1024/1024));
}
GGML_ASSERT(pool_addr != 0);
@ -471,6 +455,7 @@ struct ggml_cann_pool_vmm : public ggml_cann_pool {
*/
std::unique_ptr<ggml_cann_pool> ggml_backend_cann_context::new_pool_for_device(
int device) {
// return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_leg(device));
return std::unique_ptr<ggml_cann_pool>(new ggml_cann_pool_vmm(device));
}
@ -502,6 +487,23 @@ struct ggml_backend_cann_buffer_context {
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
};
/**
 * @brief Report the name of a CANN buffer.
 *
 * Every CANN buffer reports the same constant name; the buffer argument is
 * not inspected.
 *
 * @param buffer The CANN buffer being queried (unused).
 * @return The constant C-string "CANN".
 */
static const char* ggml_backend_cann_buffer_get_name(
    ggml_backend_buffer_t buffer) {
    GGML_UNUSED(buffer);

    return "CANN";
}
/**
* @brief Check if a buffer is a CANN buffer.
*
@ -511,10 +513,9 @@ struct ggml_backend_cann_buffer_context {
* @param buffer The buffer to check.
* @return true if the buffer is a CANN buffer, false otherwise.
*/
static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
static bool ggml_backend_buffer_is_cann(
ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_cann(buffer->buft);
return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
}
/**
@ -850,6 +851,13 @@ static void ggml_backend_cann_buffer_set_tensor(
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
void *check_buffer = malloc(size);
ggml_backend_cann_transform_back(tensor, transform_buffer,
check_buffer);
GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE));
@ -961,7 +969,8 @@ static void ggml_backend_cann_buffer_clear(
* This structure defines function pointers to operations that can be performed
* on a CANN buffer within the backend.
*/
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
static ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
/* .get_name = */ ggml_backend_cann_buffer_get_name,
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
/* .get_base = */ ggml_backend_cann_buffer_get_base,
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
@ -995,10 +1004,9 @@ struct ggml_backend_cann_buffer_type_context {
*/
static const char* ggml_backend_cann_buffer_type_name(
ggml_backend_buffer_type_t buft) {
ggml_backend_cann_buffer_type_context* buft_ctx =
(ggml_backend_cann_buffer_type_context*)buft->context;
return "CANN";
return buft_ctx->name.c_str();
GGML_UNUSED(buft);
}
/**
@ -1097,25 +1105,19 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
GGML_UNUSED(buft);
}
/**
 * @brief Report whether a CANN buffer type is a host buffer type.
 *
 * @param buft The buffer type being queried (unused).
 * @return Always false.
 */
static bool ggml_backend_cann_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);

    const bool is_host = false;
    return is_host;
}
/**
* @brief Interface for managing CANN buffer types in the GGML backend.
*
* Provides function pointers for allocating, querying properties, and managing
* memory for CANN buffer types in the GGML backend.
*/
static const ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
static ggml_backend_buffer_type_i ggml_backend_cann_buffer_type_interface = {
/* .get_name = */ ggml_backend_cann_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cann_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cann_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cann_buffer_type_get_alloc_size,
/* .is_host = */ ggml_backend_cann_buffer_type_is_host,
/* .is_host = */ NULL,
};
/**
@ -1143,10 +1145,9 @@ ggml_backend_cann_buffer_type(int32_t device) {
static bool ggml_backend_cann_buffer_type_initialized = false;
if (!ggml_backend_cann_buffer_type_initialized) {
for (int32_t i = 0; i < ggml_cann_info().device_count; i++) {
for (int32_t i = 0; i < GGML_CANN_MAX_DEVICES; i++) {
ggml_backend_cann_buffer_types[i] = {
/* .iface = */ ggml_backend_cann_buffer_type_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), i),
/* .context = */
new ggml_backend_cann_buffer_type_context{
i, "CANN" + std::to_string(i)},
@ -1212,15 +1213,10 @@ static void * ggml_cann_host_malloc(size_t size) {
return nullptr;
}
const size_t alignment = 128;
size = GGML_PAD(size, alignment);
if (size == 0) {
size = alignment;
}
void * hostPtr = nullptr;
aclError err = aclrtMallocHost((void **) &hostPtr, size);
if (err != ACL_SUCCESS) {
GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, aclGetRecentErrMsg());
return nullptr;
@ -1245,6 +1241,7 @@ static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggm
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(hostPtr, size);
buffer->buft = buft;
buffer->iface.get_name = ggml_backend_cann_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free;
return buffer;
@ -1266,7 +1263,7 @@ ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), 0),
/* .device = */ nullptr,
/* .context = */ nullptr,
};
@ -1466,6 +1463,24 @@ static void ggml_backend_cann_free(ggml_backend_t backend) {
delete backend;
}
/**
 * @brief Get the default buffer type of a CANN backend.
 *
 * Looks up the device index stored in the backend's context and returns the
 * CANN buffer type registered for that device.
 *
 * @param backend Pointer to the CANN backend structure.
 * @return The buffer type for the backend's device.
 */
static ggml_backend_buffer_type_t
ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
    const auto* ctx = (ggml_backend_cann_context*)backend->context;

    return ggml_backend_cann_buffer_type(ctx->device);
}
/**
* @brief Sets tensor data asynchronously in the CANN backend.
*
@ -1495,6 +1510,13 @@ static void ggml_backend_cann_set_tensor_async(ggml_backend_t backend,
void *transform_buffer = malloc(size);
ggml_backend_cann_transform(tensor, data, transform_buffer);
#ifndef NDEBUG
    // Debug-only round-trip check: transforming the staged buffer back must
    // reproduce the caller's bytes exactly. memcmp() returns 0 when the two
    // ranges are equal, so the assertion has to compare against 0; asserting
    // on the raw return value would only pass when the buffers differ.
    void *check_buffer = malloc(size);
    ggml_backend_cann_transform_back(tensor, transform_buffer,
                                     check_buffer);
    GGML_ASSERT(memcmp(data, check_buffer, size) == 0);
    free(check_buffer);
#endif
ACL_CHECK(aclrtMemcpyAsync(
(char *)tensor->data + offset, size, transform_buffer, size,
ACL_MEMCPY_HOST_TO_DEVICE, cann_ctx->stream()));
@ -1669,7 +1691,7 @@ static enum ggml_status ggml_backend_cann_graph_compute(
* @return bool Returns true if the operation is supported by the backend,
* otherwise false.
*/
static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
const ggml_tensor* op) {
switch (op->op) {
case GGML_OP_UNARY:
@ -1687,14 +1709,12 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
}
case GGML_OP_MUL_MAT: {
switch (op->src[0]->type) {
case GGML_TYPE_Q8_0:
// Current groupsize should not be greater than k-1 in
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
if (op->src[0]->ne[0] <= QK8_0) {
return false;
}
case GGML_TYPE_F16:
case GGML_TYPE_F32:
case GGML_TYPE_Q8_0:
// TODO: fix me
// Current groupsize should not be greater than k-1 in
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
case GGML_TYPE_Q4_0:
return true;
default:
@ -1726,41 +1746,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
return false;
}
}
case GGML_OP_CONT: {
// TODO: support GGML_TYPE_BF16
switch (op->src[0]->type) {
case GGML_TYPE_F32:
case GGML_TYPE_F16:
return true;
default:
return false;
}
}
case GGML_OP_ROPE: {
// TODO: with ops-test v == 1
float * ext_factor = (float*)((int32_t*)op->op_params + 7);
// TODO: n_dims <= ne0
if (op->src[0]->ne[0] != op->op_params[1]) {
return false;
}
// TODO: ext_factor != 0
if (*ext_factor != 0) {
return false;
}
return true;
}
case GGML_OP_UPSCALE: {
// aclnnUpsampleNearest2dGetWorkspaceSize not support
// selfDimN[2]/outDimN[2] or selfDimC[3]/outDimC[3] not equal
if (op->src[0]->ne[2] * op->ne[3] != op->src[0]->ne[3] * op->ne[2]) {
return false;
}
return true;
}
case GGML_OP_IM2COL:
case GGML_OP_CONCAT:
case GGML_OP_DUP:
case GGML_OP_REPEAT:
case GGML_OP_CONCAT:
case GGML_OP_NONE:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
@ -1774,13 +1762,17 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
case GGML_OP_SCALE:
case GGML_OP_SQR:
case GGML_OP_CLAMP:
case GGML_OP_CONT:
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:
case GGML_OP_ARGSORT:
case GGML_OP_ACC:
case GGML_OP_GROUP_NORM:
case GGML_OP_UPSCALE:
case GGML_OP_PAD:
case GGML_OP_ARANGE:
case GGML_OP_TIMESTEP_EMBEDDING:
@ -1790,7 +1782,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
return false;
}
GGML_UNUSED(dev);
GGML_UNUSED(backend);
}
/**
@ -1808,6 +1800,31 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
return buft->iface.get_name == ggml_backend_cann_buffer_type_name;
}
/**
 * @brief Check whether a CANN backend can use a given buffer type.
 *
 * A buffer type is supported only when it is a CANN buffer type and its
 * context refers to the same device index as the backend's context.
 *
 * @param backend Pointer to the CANN backend.
 * @param buft Pointer to the backend buffer type to check.
 * @return bool True when the buffer type is a CANN buffer type on the
 *              backend's device, otherwise false.
 */
static bool ggml_backend_cann_supports_buft(
    ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
    // non-CANN buffer types are never supported
    if (!ggml_backend_buft_is_cann(buft)) {
        return false;
    }

    const auto * backend_ctx = (ggml_backend_cann_context *)backend->context;
    const auto * type_ctx =
        (ggml_backend_cann_buffer_type_context *)buft->context;

    return type_ctx->device == backend_ctx->device;
}
/**
* @brief Determines if a tensor operation should be offloaded to the CANN
* backend.
@ -1822,14 +1839,54 @@ static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft) {
* @return bool Returns true if the operation should be offloaded, otherwise
* false.
*/
static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
static bool ggml_backend_cann_offload_op(ggml_backend_t backend,
const ggml_tensor* op) {
const int min_batch_size = 32;
GGML_UNUSED(dev);
GGML_UNUSED(backend);
return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS;
}
/**
 * @brief Create a new event for a CANN backend.
 *
 * Selects the backend's device, creates an ACL runtime event and wraps it in
 * a heap-allocated ggml_backend_event that refers back to the backend.
 *
 * @param backend Pointer to the CANN backend.
 * @return ggml_backend_event_t Pointer to the newly allocated event structure.
 */
static ggml_backend_event_t ggml_backend_cann_event_new(
    ggml_backend_t backend) {
    const auto* ctx = (ggml_backend_cann_context*)backend->context;

    // the ACL event is created after the backend's device has been selected
    ggml_cann_set_device(ctx->device);

    aclrtEvent acl_event;
    ACL_CHECK(aclrtCreateEvent(&acl_event));

    ggml_backend_event_t wrapped = new ggml_backend_event{
        /* .backend = */ backend,
        /* .context = */ acl_event,
    };
    return wrapped;
}
/**
 * @brief Release a CANN backend event.
 *
 * Destroys the ACL runtime event stored in the event's context, then deletes
 * the event structure.
 *
 * @param event Pointer to the event structure to be freed.
 */
static void ggml_backend_cann_event_free(ggml_backend_event_t event) {
    aclrtEvent acl_event = (aclrtEvent)event->context;

    ACL_CHECK(aclrtDestroyEvent(acl_event));
    delete event;
}
/**
* @brief Records an event on the CANN backend stream.
*
@ -1838,9 +1895,10 @@ static bool ggml_backend_cann_offload_op(ggml_backend_dev_t dev,
*
* @param event Pointer to the event structure to be recorded.
*/
static void ggml_backend_cann_event_record(ggml_backend_t backend, ggml_backend_event_t event) {
static void ggml_backend_cann_event_record(ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
(ggml_backend_cann_context*)event->backend->context;
ACL_CHECK(aclrtRecordEvent((aclrtEvent)event->context, cann_ctx->stream()));
}
@ -1858,7 +1916,8 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
ggml_backend_event_t event) {
ggml_backend_cann_context* cann_ctx =
(ggml_backend_cann_context*)backend->context;
if (ggml_backend_is_cann(backend)) {
if (ggml_backend_is_cann(event->backend)) {
ACL_CHECK(aclrtStreamWaitEvent(cann_ctx->stream(),
(aclrtEvent)event->context));
} else {
@ -1866,6 +1925,17 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
}
}
/**
 * @brief Wait for a CANN backend event.
 *
 * Calls aclrtSynchronizeEvent() on the ACL runtime event stored in the
 * event's context.
 *
 * @param event Pointer to the event structure to be synchronized.
 */
static void ggml_backend_cann_event_synchronize(ggml_backend_event_t event) {
    aclrtEvent acl_event = (aclrtEvent)event->context;

    ACL_CHECK(aclrtSynchronizeEvent(acl_event));
}
/**
* @brief Structure defining the interface for the CANN backend.
*
@ -1873,9 +1943,10 @@ static void ggml_backend_cann_event_wait(ggml_backend_t backend,
* supported by the CANN backend, including name retrieval, memory
* management, tensor operations, synchronization, and event handling.
*/
static const ggml_backend_i ggml_backend_cann_interface = {
static ggml_backend_i ggml_backend_cann_interface = {
/* .get_name = */ ggml_backend_cann_name,
/* .free = */ ggml_backend_cann_free,
/* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
@ -1885,6 +1956,9 @@ static const ggml_backend_i ggml_backend_cann_interface = {
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cann_graph_compute,
/* .supports_op = */ ggml_backend_cann_supports_op,
/* .supports_buft = */ ggml_backend_cann_supports_buft,
/* .offload_op = */ ggml_backend_cann_offload_op,
/* .event_record = */ ggml_backend_cann_event_record,
/* .event_wait = */ ggml_backend_cann_event_wait,
};
@ -1903,235 +1977,6 @@ static ggml_guid_t ggml_backend_cann_guid() {
return &guid;
}
// backend device
// Per-device state stored in the context pointer of each CANN
// ggml_backend_device; filled in by ggml_backend_cann_reg().
struct ggml_backend_cann_device_context {
int device; // device index (the loop index used at registration)
std::string name; // GGML_CANN_NAME followed by the device index
std::string description; // value returned by aclrtGetSocName() at registration
};
/**
 * @brief Get the name of a CANN device.
 *
 * @param dev The CANN device.
 * @return C-string view of the name stored in the device context.
 */
static const char * ggml_backend_cann_device_get_name(ggml_backend_dev_t dev) {
    const auto * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
    const std::string & device_name = dev_ctx->name;
    return device_name.c_str();
}
/**
 * @brief Get the description of a CANN device.
 *
 * @param dev The CANN device.
 * @return C-string view of the description stored in the device context.
 */
static const char* ggml_backend_cann_device_get_description(ggml_backend_dev_t dev) {
    const auto * dev_ctx = (ggml_backend_cann_device_context *)dev->context;
    const std::string & text = dev_ctx->description;
    return text.c_str();
}
/**
 * @brief Query free and total memory of a CANN device.
 *
 * Forwards to ggml_backend_cann_get_device_memory() with the device index
 * stored in the device context.
 *
 * @param dev   The CANN device.
 * @param free  Output: free memory as reported by the helper.
 * @param total Output: total memory as reported by the helper.
 */
static void ggml_backend_cann_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    const int device_id = ((ggml_backend_cann_device_context *)dev->context)->device;
    ggml_backend_cann_get_device_memory(device_id, free, total);
}
/**
 * @brief Get the device type of a CANN device.
 *
 * @param dev The CANN device (unused).
 * @return Always GGML_BACKEND_DEVICE_TYPE_GPU.
 */
static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
    const enum ggml_backend_dev_type kind = GGML_BACKEND_DEVICE_TYPE_GPU;
    GGML_UNUSED(dev);
    return kind;
}
/**
 * @brief Fill in the properties of a CANN device.
 *
 * Name, description, type and memory figures come from the per-device
 * getters. The host_buffer capability is reported unless the
 * GGML_CANN_NO_PINNED environment variable is set.
 *
 * @param dev   The CANN device.
 * @param props Output structure to populate.
 */
static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
    props->name        = ggml_backend_cann_device_get_name(dev);
    props->description = ggml_backend_cann_device_get_description(dev);
    props->type        = ggml_backend_cann_device_get_type(dev);
    ggml_backend_cann_device_get_memory(dev, &props->memory_free, &props->memory_total);

    // any value of GGML_CANN_NO_PINNED (even empty) disables the host buffer capability
    const bool pinned_allowed = (getenv("GGML_CANN_NO_PINNED") == nullptr);

    props->caps = {
        /* .async                 = */ false,
        /* .host_buffer           = */ pinned_allowed,
        /* .buffer_from_host_ptr  = */ false,
        /* .events                = */ true,
    };
}
/**
 * @brief Create a CANN backend instance for a device.
 *
 * @param dev    The CANN device.
 * @param params Initialization parameters (unused).
 * @return The result of ggml_backend_cann_init() for the device's index.
 */
static ggml_backend_t ggml_backend_cann_device_init(ggml_backend_dev_t dev, const char * params) {
    GGML_UNUSED(params);

    const int device_id = ((ggml_backend_cann_device_context *)dev->context)->device;
    return ggml_backend_cann_init(device_id);
}
/**
 * @brief Check whether a CANN device can use a given buffer type.
 *
 * A buffer type is supported only when it is a CANN buffer type and its
 * context refers to the same device index as the device's context.
 *
 * @param dev Pointer to the CANN device.
 * @param buft Pointer to the backend buffer type to check.
 * @return bool True when the buffer type is a CANN buffer type on this
 *              device, otherwise false.
 */
static bool ggml_backend_cann_supports_buft(
    ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
    // non-CANN buffer types are never supported
    if (!ggml_backend_buft_is_cann(buft)) {
        return false;
    }

    const auto * device_ctx = (ggml_backend_cann_device_context *)dev->context;
    const auto * type_ctx =
        (ggml_backend_cann_buffer_type_context *)buft->context;

    return type_ctx->device == device_ctx->device;
}
/**
 * @brief Get the buffer type of a CANN device.
 *
 * @param dev The CANN device.
 * @return The CANN buffer type for the device index stored in the context.
 */
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_buffer_type(ggml_backend_dev_t dev) {
    const int device_id = ((ggml_backend_cann_device_context *)dev->context)->device;
    return ggml_backend_cann_buffer_type(device_id);
}
/**
 * @brief Get the host buffer type used with CANN devices.
 *
 * @param dev The CANN device (unused; the same host buffer type is returned
 *            for every device).
 * @return The result of ggml_backend_cann_host_buffer_type().
 */
static ggml_backend_buffer_type_t ggml_backend_cann_device_get_host_buffer_type(ggml_backend_dev_t dev) {
    ggml_backend_buffer_type_t host_buft = ggml_backend_cann_host_buffer_type();
    GGML_UNUSED(dev);
    return host_buft;
}
/**
 * @brief Create a new event for a CANN device.
 *
 * Selects the device, creates an ACL runtime event and wraps it in a
 * heap-allocated ggml_backend_event bound to the registry's device object
 * for the same device index.
 *
 * @param dev Pointer to the CANN device.
 * @return ggml_backend_event_t Pointer to the newly allocated event structure.
 */
static ggml_backend_event_t ggml_backend_cann_device_event_new(
    ggml_backend_dev_t dev) {
    const auto * device_ctx = (ggml_backend_cann_device_context *)dev->context;

    // the ACL event is created after the device has been selected
    ggml_cann_set_device(device_ctx->device);

    aclrtEvent acl_event;
    ACL_CHECK(aclrtCreateEvent(&acl_event));

    ggml_backend_event_t wrapped = new ggml_backend_event{
        /* .device  = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device_ctx->device),
        /* .context = */ acl_event,
    };
    return wrapped;
}
/**
 * @brief Release a CANN backend event.
 *
 * Destroys the ACL runtime event stored in the event's context, then deletes
 * the event structure.
 *
 * @param dev   The CANN device (unused).
 * @param event Pointer to the event structure to be freed.
 */
static void ggml_backend_cann_device_event_free(ggml_backend_dev_t dev, ggml_backend_event_t event) {
    GGML_UNUSED(dev);

    aclrtEvent acl_event = (aclrtEvent)event->context;
    ACL_CHECK(aclrtDestroyEvent(acl_event));

    delete event;
}
/**
 * @brief Wait for a CANN backend event.
 *
 * Calls aclrtSynchronizeEvent() on the ACL runtime event stored in the
 * event's context.
 *
 * @param dev   The CANN device (unused).
 * @param event Pointer to the event structure to be synchronized.
 */
static void ggml_backend_cann_device_event_synchronize(ggml_backend_dev_t dev, ggml_backend_event_t event) {
    GGML_UNUSED(dev);

    aclrtEvent acl_event = (aclrtEvent)event->context;
    ACL_CHECK(aclrtSynchronizeEvent(acl_event));
}
// Device interface table for the CANN backend: binds each
// ggml_backend_device_i slot to its CANN implementation. The only empty slot
// is buffer_from_host_ptr.
static const ggml_backend_device_i ggml_backend_cann_device_interface = {
/* .get_name = */ ggml_backend_cann_device_get_name,
/* .get_description = */ ggml_backend_cann_device_get_description,
/* .get_memory = */ ggml_backend_cann_device_get_memory,
/* .get_type = */ ggml_backend_cann_device_get_type,
/* .get_props = */ ggml_backend_cann_device_get_props,
/* .init_backend = */ ggml_backend_cann_device_init, // called for every card
/* .get_buffer_type = */ ggml_backend_cann_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cann_device_get_host_buffer_type,
/* .buffer_from_host_ptr = */ NULL, // not supported for CANN
/* .supports_op = */ ggml_backend_cann_supports_op,
/* .supports_buft = */ ggml_backend_cann_supports_buft,
/* .offload_op = */ ggml_backend_cann_offload_op,
/* .event_new = */ ggml_backend_cann_device_event_new,
/* .event_free = */ ggml_backend_cann_device_event_free,
/* .event_synchronize = */ ggml_backend_cann_device_event_synchronize,
};
// backend reg
// Registry state stored in the context pointer of the CANN ggml_backend_reg.
struct ggml_backend_cann_reg_context {
// device objects created by ggml_backend_cann_reg(), in the order they were
// pushed there (the registration loop index)
std::vector<ggml_backend_dev_t> devices;
};
/**
 * @brief Get the name of the CANN backend registry.
 *
 * @param reg The registry (unused).
 * @return The GGML_CANN_NAME constant.
 */
static const char * ggml_backend_cann_reg_get_name(ggml_backend_reg_t reg) {
    const char * registry_name = GGML_CANN_NAME;
    GGML_UNUSED(reg);
    return registry_name;
}
/**
 * @brief Get the number of devices held by the CANN registry.
 *
 * @param reg The CANN registry.
 * @return Number of device objects stored in the registry context.
 */
static size_t ggml_backend_cann_reg_get_device_count(ggml_backend_reg_t reg) {
    const auto * reg_ctx = (ggml_backend_cann_reg_context *)reg->context;
    const size_t device_count = reg_ctx->devices.size();
    return device_count;
}
/**
 * @brief Get a device object from the CANN registry by index.
 *
 * Asserts that the index is within the registry's device list.
 *
 * @param reg   The CANN registry.
 * @param index Position in the registry's device list.
 * @return The device object stored at that position.
 */
static ggml_backend_dev_t ggml_backend_cann_reg_get_device(ggml_backend_reg_t reg, size_t index) {
    auto * reg_ctx = (ggml_backend_cann_reg_context *)reg->context;
    const std::vector<ggml_backend_dev_t> & device_list = reg_ctx->devices;

    GGML_ASSERT(index < device_list.size());
    return device_list[index];
}
/**
 * @brief Look up an optional entry point by name.
 *
 * No entry points are currently exposed; the hook is reserved for future use.
 *
 * @param reg  The CANN registry (unused).
 * @param name Requested entry point name (unused).
 * @return Always nullptr.
 */
static void * ggml_backend_cann_reg_get_proc_address(ggml_backend_reg_t reg, const char * name) {
    GGML_UNUSED(name);
    GGML_UNUSED(reg);

    void * entry_point = nullptr;
    return entry_point;
}
// Registry interface table for the CANN backend: binds each
// ggml_backend_reg_i slot to the callback defined above.
static const ggml_backend_reg_i ggml_backend_cann_reg_interface = {
/* .get_name = */ ggml_backend_cann_reg_get_name,
/* .get_device_count = */ ggml_backend_cann_reg_get_device_count,
/* .get_device = */ ggml_backend_cann_reg_get_device,
/* .get_proc_address = */ ggml_backend_cann_reg_get_proc_address,
};
// Backend registry accessor for the CANN backend.
// Returns the address of a function-local static registry object. The
// initialization block below runs only while `initialized` is false, i.e. on
// the first call; later calls just take the lock and return the same pointer.
ggml_backend_reg_t ggml_backend_cann_reg() {
static ggml_backend_reg reg;
static bool initialized = false;
{
// the mutex guards both the check and the update of `initialized`
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
// return value of aclInit() is not checked here
aclInit(nullptr);
ggml_backend_cann_reg_context * ctx = new ggml_backend_cann_reg_context;
// create one device object per device reported by ggml_cann_info()
for (int i = 0; i < ggml_cann_info().device_count; i++) {
ggml_backend_cann_device_context* dev_ctx = new ggml_backend_cann_device_context();
// NOTE(review): the SoC name is queried before ggml_cann_set_device(i)
// below - confirm it is meant to be independent of the selected device
dev_ctx->description = aclrtGetSocName();
dev_ctx->device = i;
dev_ctx->name = GGML_CANN_NAME + std::to_string(i);
ggml_cann_set_device(i);
ggml_backend_dev_t dev = new ggml_backend_device {
/* .iface = */ ggml_backend_cann_device_interface,
/* .reg = */ &reg,
/* .context = */ dev_ctx
};
ctx->devices.push_back(dev);
}
// the contexts and device objects allocated above are never deleted in this function
reg = ggml_backend_reg {
/* .api_version = */ GGML_BACKEND_API_VERSION,
/* .iface = */ ggml_backend_cann_reg_interface,
/* .context = */ ctx
};
}
initialized = true;
}
return &reg;
}
ggml_backend_t ggml_backend_cann_init(int32_t device) {
aclInit(nullptr);
if (device < 0 || device >= ggml_backend_cann_get_device_count()) {
@ -2148,7 +1993,7 @@ ggml_backend_t ggml_backend_cann_init(int32_t device) {
ggml_backend_t cann_backend =
new ggml_backend{/* .guid = */ ggml_backend_cann_guid(),
/* .interface = */ ggml_backend_cann_interface,
/* .device = */ ggml_backend_reg_dev_get(ggml_backend_cann_reg(), device),
/* .device = */ nullptr,
/* .context = */ ctx};
return cann_backend;
@ -2175,5 +2020,3 @@ void ggml_backend_cann_get_device_memory(int32_t device, size_t* free,
ggml_cann_set_device(device);
ACL_CHECK(aclrtGetMemInfo(ACL_HBM_MEM, free, total));
}
GGML_BACKEND_DL_IMPL(ggml_backend_cann_reg)

View File

@ -1,76 +0,0 @@
# When CANN_INSTALL_DIR is unset or empty, fall back to the ASCEND_TOOLKIT_HOME
# environment variable (if it is defined).
if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOME})
set(CANN_INSTALL_DIR $ENV{ASCEND_TOOLKIT_HOME})
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif()
# Auto-detect the SoC type and SoC version; if detection fails, the build is aborted
set(SOC_VERSION "")
# detect_ascend_soc_type(<out-var>)
#
# Runs `npu-smi info`, takes the third whitespace-separated field of output
# line 7 and stores "Ascend<field>" in <out-var> in the caller's scope.
# Configuration is aborted when the command fails or yields no value.
function(detect_ascend_soc_type SOC_VERSION)
    execute_process(
        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
        OUTPUT_VARIABLE npu_info
        RESULT_VARIABLE npu_result
        OUTPUT_STRIP_TRAILING_WHITESPACE
    )
    # RESULT_VARIABLE holds the exit code, or an error string when the process
    # could not be run. It is compared as a quoted string so that a multi-word
    # error message cannot turn the if() condition into a malformed expression.
    if("${npu_info}" STREQUAL "" OR NOT "${npu_result}" STREQUAL "0")
        message(FATAL_ERROR "Auto-detect ascend soc type failed, please specify manually or check ascend device working normally.")
    endif()
    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
endfunction()
# Use the caller-provided SOC_TYPE if there is one; otherwise detect it.
if(NOT SOC_TYPE)
detect_ascend_soc_type(SOC_VERSION)
set(SOC_TYPE "${SOC_VERSION}")
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
endif()
string(TOLOWER ${SOC_TYPE} SOC_VERSION) # SOC_VERSION needs to be lower case
# Construct the SoC-specific compile option ASCEND_<major serial number>,
# e.g. ASCEND_910B or ASCEND_310P: the first run of digits followed by one
# letter is taken from SOC_VERSION and the result is upper-cased.
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
# Configure the ggml-cann backend library. Requires CANN_INSTALL_DIR to be set
# (directly or via the ASCEND_TOOLKIT_HOME fallback); otherwise configuration
# is aborted.
if (CANN_INSTALL_DIR)
# Only UNIX hosts are supported.
if (NOT UNIX)
message(FATAL_ERROR "CANN: CANN toolkit supports unix but not ${CMAKE_SYSTEM_NAME}")
endif()
# Supported platforms: x86-64, arm64 (the first two branches are intentionally
# empty; any other processor is rejected)
if (CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64")
elseif (CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "amd64")
else()
message(FATAL_ERROR "CANN: CANN toolkit supports x86-64 and arm64 but not ${CMAKE_SYSTEM_PROCESSOR}")
endif()
# Set header and libs
set(CANN_INCLUDE_DIRS
${CANN_INSTALL_DIR}/include
${CANN_INSTALL_DIR}/include/aclnn
${CANN_INSTALL_DIR}/acllib/include
)
# the kernels subdirectory is added before the libraries are listed;
# presumably it defines the ascendc_kernels target linked below - TODO confirm
add_subdirectory(kernels)
list(APPEND CANN_LIBRARIES
ascendcl
nnopbase
opapi
acl_op_compiler
ascendc_kernels
)
# every .cpp file in this directory is compiled into the backend library
file(GLOB GGML_SOURCES_CANN "*.cpp")
ggml_add_backend_library(ggml-cann ${GGML_SOURCES_CANN})
target_link_libraries(ggml-cann PRIVATE ${CANN_LIBRARIES})
target_include_directories(ggml-cann PRIVATE ${CANN_INCLUDE_DIRS})
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
# SoC-specific define built above (ASCEND_<major serial number>)
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
else()
message(FATAL_ERROR "CANN: Can't find CANN_INSTALL_DIR, did you forget to source set_var.sh?")
endif()

View File

@ -22,14 +22,11 @@
#include "aclnn_ops.h"
#include <aclnnop/aclnn_addcdiv.h>
#include <aclnnop/aclnn_avgpool2d.h>
#include <aclnnop/aclnn_batch_matmul.h>
#include <aclnnop/aclnn_cast.h>
#include <aclnnop/aclnn_constant_pad_nd.h>
#include <aclnnop/aclnn_copy.h>
#include <aclnnop/aclnn_cos.h>
#include <aclnnop/aclnn_div.h>
#include <aclnnop/aclnn_exp.h>
#include <aclnnop/aclnn_fill_scalar.h>
#include <aclnnop/aclnn_group_norm.h>
@ -37,7 +34,6 @@
#include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_matmul.h>
#include <aclnnop/aclnn_max_pool.h>
#include <aclnnop/aclnn_mm.h>
#include <aclnnop/aclnn_permute.h>
#include <aclnnop/aclnn_pow_tensor_tensor.h>
#include <aclnnop/aclnn_reduce_sum.h>
@ -57,7 +53,6 @@
#include <exception>
#include <vector>
#include "ggml-impl.h"
#include "kernels/ascendc_kernels.h"
#define GGML_COMMON_DECL_C
@ -246,14 +241,10 @@ void ggml_cann_concat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
aclTensor* acl_src1 = ggml_cann_create_tensor(src1);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
const int32_t dim = ggml_get_op_params_i32(dst, 0);
GGML_ASSERT(dim >= 0 && dim < 4);
int32_t acl_dim = 3 - dim;
int64_t concat_dim = 1;
aclTensor* tensors[] = {acl_src0, acl_src1};
aclTensorList* tensorList = aclCreateTensorList(tensors, 2);
aclnn_concat(ctx, tensorList, acl_dst, acl_dim);
aclnn_concat(ctx, tensorList, acl_dst, concat_dim);
ACL_CHECK(aclDestroyTensorList(tensorList));
ACL_CHECK(aclDestroyTensor(acl_dst));
@ -1105,9 +1096,9 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
}
/**
* @brief Creates an ACL tensor initialized with value using a provided buffer.
* @brief Creates an ACL tensor initialized with ones using a provided buffer.
*
* This function initializes a tensor with value using the specified buffer and
* This function initializes a tensor with ones using the specified buffer and
* tensor parameters.
*
* @param ctx The context for the CANN backend operations.
@ -1120,12 +1111,12 @@ static aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, void* buffer,
* @param type_size The size of each element in the tensor data type.
* @param value The value to be used for initializing the tensor (default
* is 1.0).
* @return An ACL tensor initialized with value.
* @return An ACL tensor initialized with ones.
*/
static aclTensor* aclnn_values(ggml_backend_cann_context& ctx, void* buffer,
size_t n_bytes, int64_t* ne, int64_t dims,
aclDataType type, size_t type_size,
float value = 1.0f) {
static aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, void* buffer,
size_t n_bytes, int64_t* ne, int64_t dims,
aclDataType type, size_t type_size,
float value = 1.0f) {
aclTensor* acl_tensor =
aclnn_zero(ctx, buffer, n_bytes, ne, dims, type, type_size);
float alpha_host = 1.0f;
@ -1167,7 +1158,7 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
size_t one_tensor_n_bytes = src->ne[0] * ggml_element_size(src);
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
aclTensor* acl_gamma = aclnn_values(
aclTensor* acl_gamma = aclnn_ones(
ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne, 1,
ggml_cann_type_mapping(src->type), ggml_element_size(src));
@ -1211,9 +1202,9 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
ggml_cann_pool_alloc one_tensor_allocator(ctx.pool(), one_tensor_n_bytes);
aclTensor* mask_tensor =
aclnn_values(ctx, one_tensor_allocator.get(), one_tensor_n_bytes,
src->ne, GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
ggml_element_size(src), value);
aclnn_ones(ctx, one_tensor_allocator.get(), one_tensor_n_bytes, src->ne,
GGML_MAX_DIMS, ggml_cann_type_mapping(src->type),
ggml_element_size(src), value);
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
@ -1446,6 +1437,10 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_tensor* src0 = dst->src[0]; // kernel
ggml_tensor* src1 = dst->src[1]; // input
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
GGML_TENSOR_BINARY_OP_LOCALS;
// aclnnIm2col only works on 2D. set s1, p1, d1 to 1 to perform 2D
@ -1467,6 +1462,9 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
const int64_t OH = is_2D ? ne2 : 1;
const int64_t OW = ne1;
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
GGML_ASSERT(nb10 == sizeof(float));
// memory allocated increased to 3x when is_2D == false
const int64_t n_bytes_factor = is_2D ? 1 : 3;
@ -1770,92 +1768,6 @@ static void aclnn_sin(ggml_backend_cann_context& ctx, aclTensor* acl_src,
ACL_CHECK(aclnnSin(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
/**
 * @brief In-place addcdiv: divides tensor1 by tensor2 element-wise, scales the
 *        quotient by a scalar and accumulates the result into self.
 *
 * The operation is defined as:
 * \f[
 *     \text{self}_i = \text{self}_i + \text{value} \times
 *                     \frac{\text{tensor1}_i}{\text{tensor2}_i}
 * \f]
 *
 * @param ctx      The context for the CANN backend operations.
 * @param acl_self The tensor that is updated in place with the result.
 * @param tensor1  Numerator tensor.
 * @param tensor2  Denominator tensor.
 * @param value    Scalar coefficient applied to the quotient.
 */
static void aclnn_inplace_addcdiv(ggml_backend_cann_context& ctx,
                                  aclTensor* acl_self, aclTensor* tensor1,
                                  aclTensor* tensor2, float value) {
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

    // Wrap the host float in an ACL scalar handle; it is owned by this
    // function and must be destroyed before returning.
    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);

    // Phase 1: query the workspace size and obtain the executor.
    ACL_CHECK(aclnnInplaceAddcdivGetWorkspaceSize(
        acl_self, tensor1, tensor2, acl_value, &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    // Phase 2: enqueue the operator on the context's stream.
    ACL_CHECK(aclnnInplaceAddcdiv(workspaceAddr, workspaceSize, executor,
                                  ctx.stream()));

    // Previously the scalar handle was never released and leaked on every
    // call.
    ACL_CHECK(aclDestroyScalar(acl_value));
}
/**
 * @brief Element-wise tensor division, optionally in place.
 *
 * Divides every element of `acl_src` by the corresponding element of
 * `acl_other`:
 * \f[
 *     \text{dst}_i = \frac{\text{acl_src}_i}{\text{acl_other}_i}
 * \f]
 * When `inplace` is true the quotient overwrites `acl_src` and `acl_dst` is
 * ignored; otherwise the quotient is written to `acl_dst`.
 *
 * @param ctx       The context for the CANN backend operations.
 * @param acl_src   Numerator tensor (also the output when `inplace` is true).
 * @param acl_other Denominator tensor.
 * @param acl_dst   Destination tensor, used only when `inplace` is false.
 * @param inplace   Selects the in-place variant operating on `acl_src`.
 */
static void aclnn_div_tensor(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                             aclTensor* acl_other, aclTensor* acl_dst,
                             bool inplace) {
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
    uint64_t workspaceSize = 0;

    if (!inplace) {
        // Out-of-place variant: acl_dst receives the quotient.
        ACL_CHECK(aclnnDivGetWorkspaceSize(acl_src, acl_other, acl_dst,
                                           &workspaceSize, &executor));
        if (workspaceSize > 0) {
            ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
            workspaceAddr = workspace_allocator.get();
        }
        ACL_CHECK(
            aclnnDiv(workspaceAddr, workspaceSize, executor, ctx.stream()));
        return;
    }

    // In-place variant: acl_src is overwritten with the quotient.
    ACL_CHECK(aclnnInplaceDivGetWorkspaceSize(acl_src, acl_other,
                                              &workspaceSize, &executor));
    if (workspaceSize > 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }
    ACL_CHECK(aclnnInplaceDiv(workspaceAddr, workspaceSize, executor,
                              ctx.stream()));
}
void ggml_cann_timestep_embedding(ggml_backend_cann_context& ctx,
ggml_tensor* dst) {
const ggml_tensor* src = dst->src[0];
@ -2399,16 +2311,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ctx.stream()));
switch (src0->type) {
case GGML_TYPE_F32: {
#ifdef ASCEND_310P
// Special operation for get_row_f32 kernel of 310P: clear the
// content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 8) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
}
#endif
case GGML_TYPE_F32:
aclrtlaunch_ascendc_get_row_f32(
24, ctx.stream(), src0->data, src1->data, dst->data,
((ggml_tensor*)src0->extra)->ne,
@ -2417,19 +2320,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
((ggml_tensor*)dst->extra)->nb);
break;
}
case GGML_TYPE_F16: {
#ifdef ASCEND_310P
// Special operation for get_row_f16 kernel of 310P: clear the
// content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 16) != 0) {
size_t dst_len =
src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
ggml_type_size(
GGML_TYPE_F32); // out is also f32, even input is f16
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
}
#endif
case GGML_TYPE_F16:
aclrtlaunch_ascendc_get_row_f16(
24, ctx.stream(), src0->data, src1->data, dst->data,
((ggml_tensor*)src0->extra)->ne,
@ -2438,7 +2329,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
((ggml_tensor*)dst->extra)->nb);
break;
}
case GGML_TYPE_Q4_0:
aclrtlaunch_ascendc_get_row_q4_0(
24, ctx.stream(), src0->data, src1->data, dst->data,
@ -2517,6 +2407,7 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclTensor* acl_weight, aclTensor* acl_dst) {
int8_t cube_math_type = 1; // ALLOW_FP32_DOWN_PRECISION, when input is
// fp32, atlas a2 will transpose it to HFLOAT32.
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
@ -2534,81 +2425,6 @@ static void aclnn_mat_mul(ggml_backend_cann_context& ctx, aclTensor* acl_input,
aclnnMatmul(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
/**
 * @brief Matrix multiplication of two 2D tensors via aclnnMm.
 *
 * Computes
 * \f[
 *     \text {acl_dst}=\text {acl_input@acl_weight}
 * \f]
 * and stores the product in `acl_dst`.
 *
 * @param ctx        The context for the CANN backend operations.
 * @param acl_input  Left-hand operand of the multiplication.
 * @param acl_weight Right-hand operand of the multiplication.
 * @param acl_dst    Destination tensor receiving the product.
 */
static void aclnn_mat_mul_2d(ggml_backend_cann_context& ctx,
                             aclTensor* acl_input, aclTensor* acl_weight,
                             aclTensor* acl_dst) {
    // NOTE(review): 2 presumably selects the FP16 compute mode in CANN's
    // cubeMathType enumeration — confirm against the aclnnMm documentation.
    const int8_t cube_math_type = 2;

    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
    uint64_t workspaceSize = 0;

    // Query the workspace requirement and obtain the executor.
    ACL_CHECK(aclnnMmGetWorkspaceSize(acl_input, acl_weight, acl_dst,
                                      cube_math_type, &workspaceSize,
                                      &executor));

    // Borrow scratch memory from the context pool only when it is needed.
    if (workspaceSize != 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    // Enqueue the multiplication on the context's stream.
    ACL_CHECK(aclnnMm(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
/**
 * @brief Batched matrix multiplication of two 3D tensors via aclnnBatchMatMul.
 *
 * Computes
 * \f[
 *     \text {acl_dst}=\text {acl_input@acl_weight}
 * \f]
 * and stores the product in `acl_dst`.
 *
 * @param ctx        The context for the CANN backend operations.
 * @param acl_input  Left-hand operand of the multiplication.
 * @param acl_weight Right-hand operand of the multiplication.
 * @param acl_dst    Destination tensor receiving the product.
 */
static void aclnn_mat_mul_3d(ggml_backend_cann_context& ctx,
                             aclTensor* acl_input, aclTensor* acl_weight,
                             aclTensor* acl_dst) {
    // NOTE(review): 2 presumably selects the FP16 compute mode in CANN's
    // cubeMathType enumeration — confirm against the aclnnBatchMatMul
    // documentation.
    const int8_t cube_math_type = 2;

    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;
    uint64_t workspaceSize = 0;

    // Query the workspace requirement and obtain the executor.
    ACL_CHECK(aclnnBatchMatMulGetWorkspaceSize(acl_input, acl_weight, acl_dst,
                                               cube_math_type, &workspaceSize,
                                               &executor));

    // Borrow scratch memory from the context pool only when it is needed.
    if (workspaceSize != 0) {
        ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
        workspaceAddr = workspace_allocator.get();
    }

    // Enqueue the batched multiplication on the context's stream.
    ACL_CHECK(
        aclnnBatchMatMul(workspaceAddr, workspaceSize, executor, ctx.stream()));
}
/**
* @brief Performs matrix multiplication with floating-point precision on
* tensors using the CANN backend.
@ -2630,39 +2446,20 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
// broadcast, when weight ne2 or ne3 is not 1, weight need repeat.
BCAST_MUL_MAT_SHAPE(input, weight, dst);
int64_t n_dims = bcast_dims;
if (bcast_input_ne[3] == bcast_weight_ne[3] && bcast_input_ne[3] == 1) {
if (bcast_input_ne[2] == 1 && bcast_weight_ne[2] == 1) {
n_dims = 2;
} else if (bcast_input_ne[2] == 1) {
n_dims = 3;
}
}
aclTensor* acl_input_tensor =
ggml_cann_create_tensor(input, bcast_input_ne, bcast_input_nb, n_dims);
// transpose weight: [1,2,3,4] -> [1,2,4,3]
int64_t transpose_ne[] = {bcast_weight_ne[1], bcast_weight_ne[0],
bcast_weight_ne[2], bcast_weight_ne[3],
bcast_weight_ne[4], bcast_weight_ne[5]};
size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
bcast_weight_nb[2], bcast_weight_nb[3],
bcast_weight_nb[4], bcast_weight_nb[5]};
aclTensor* acl_weight_tensor =
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
aclTensor* acl_dst =
ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
switch (n_dims) {
case 2:
aclnn_mat_mul_2d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
case 3:
aclnn_mat_mul_3d(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
default:
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
break;
}
aclTensor* acl_weight_tensor =
ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, bcast_dims);
aclTensor* acl_input_tensor =
ggml_cann_create_tensor(input, BCAST_MUL_MAT_PARAM(input));
aclTensor* acl_dst = ggml_cann_create_tensor(dst, BCAST_MUL_MAT_PARAM(dst));
aclnn_mat_mul(ctx, acl_input_tensor, acl_weight_tensor, acl_dst);
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
@ -2683,47 +2480,51 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
* multiplication will be stored.
*/
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
ggml_tensor* dst,
const enum ggml_type type) {
ggml_tensor* dst,
const enum ggml_type type) {
ggml_tensor* src0 = dst->src[0]; // weight
ggml_tensor* src1 = dst->src[1]; // input
// The shape of the weight is NCHW.
// Matrix multiplication uses HW dims.
// HC is regarded as batch.
// weight need transpose.
// The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
// is regarded as batch. weight need transpose.
int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
float weight_elem_size;
if (type == GGML_TYPE_Q4_0) {
weight_elem_size = float(sizeof(uint8_t)) / 2;
} else if (type == GGML_TYPE_Q8_0) {
}
else if (type == GGML_TYPE_Q8_0) {
weight_elem_size = float(sizeof(uint8_t));
} else {
}
else {
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
}
float weight_nb[] = {src0->ne[0] * weight_elem_size, weight_elem_size};
size_t weight_stride = src0->ne[1] * src0->ne[0] * weight_elem_size;
float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
// size of one matrix is element_size * height * width.
size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
// scale stored at the end of weight. Also need transpose.
GGML_ASSERT(QK4_0 == QK8_0);
int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
size_t scale_elem_size = sizeof(uint16_t);
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
scale_elem_size};
size_t scale_stride = src0->ne[1] * src0->ne[0] / QK8_0 * scale_elem_size;
size_t scale_stride = scale_elem_size * src0->ne[0] * src0->ne[1] / QK8_0;
char* scale_offset = (char*)src0->data + weight_size;
// input
void* input_buffer;
size_t input_elem_size = sizeof(uint16_t);
int64_t input_ne[] = {src1->ne[0], src1->ne[1]};
size_t input_nb[] = {input_elem_size, input_ne[0] * input_elem_size};
size_t input_stride = input_ne[0] * input_ne[1] * input_elem_size;
ggml_cann_pool_alloc input_alloctor(ctx.pool());
void* input_buffer = src1->data;
size_t input_nb[] = {input_elem_size, input_elem_size * src1->ne[0]};
size_t input_stride = input_elem_size * src1->ne[0] * src1->ne[1];
// case in
ggml_cann_pool_alloc input_alloctor(ctx.pool());
if (src1->type != GGML_TYPE_F16) {
aclTensor* acl_src1_tensor = ggml_cann_create_tensor(src1);
input_buffer =
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
input_alloctor.alloc(ggml_nelements(src1) * input_elem_size);
input_buffer = input_alloctor.get();
int64_t* input_cast_ne = src1->ne;
size_t input_cast_nb[GGML_MAX_DIMS];
@ -2736,136 +2537,85 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
input_buffer, ACL_FLOAT16, input_elem_size, input_cast_ne,
input_cast_nb, GGML_MAX_DIMS);
aclnn_cast(ctx, acl_src1_tensor, acl_input_tensor, ACL_FLOAT16);
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
ACL_CHECK(aclDestroyTensor(acl_src1_tensor));
} else {
input_buffer = src1->data;
}
// output
size_t output_elem_size = sizeof(uint16_t);
size_t output_nb[] = {output_elem_size, dst->ne[0] * output_elem_size};
ggml_cann_pool_alloc output_allocator(ctx.pool());
void* output_buffer =
output_allocator.alloc(ggml_nelements(dst) * output_elem_size);
size_t output_stride = dst->ne[0] * dst->ne[1] * output_elem_size;
int64_t output_ne[] = {dst->ne[0], dst->ne[1]};
size_t output_nb[] = {output_elem_size, output_elem_size * dst->ne[0]};
ggml_cann_pool_alloc output_alloctor(
ctx.pool(), ggml_nelements(dst) * output_elem_size);
void* output_buffer = output_alloctor.get();
size_t output_stride = output_elem_size * dst->ne[0] * dst->ne[1];
// aclnn
int64_t max_elem_size = 65535;
int64_t split_size = (src0->ne[1] / max_elem_size) + 1;
ggml_cann_pool_alloc workspace_allocator(ctx.pool());
aclOpExecutor* executor = nullptr;
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
for (int64_t n1 = 0; n1 < src1->ne[3]; n1++) {
for (int64_t c1 = 0; c1 < src1->ne[2]; c1++) {
int64_t n0 = n1 / (src1->ne[3] / src0->ne[3]);
int64_t c0 = c1 / (src1->ne[2] / src0->ne[2]);
int64_t batch1 = (n1 * src1->ne[2]) + c1;
int64_t batch0 = (n0 * src0->ne[2]) + c0;
int64_t batch1 = n1 * src1->ne[2] + c1;
int64_t batch0 = n0 * src0->ne[2] + c0;
aclTensor* acl_input_tensor = ggml_cann_create_tensor(
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
input_elem_size, input_ne, input_nb, 2);
// first split
int64_t weight_ne_offset = 0;
int64_t weight_ne[2] = {
max_elem_size > src0->ne[1] ? src0->ne[1] : max_elem_size,
src0->ne[0]};
int64_t scale_ne_offset = 0;
int64_t scale_ne[2] = {weight_ne[0], weight_ne[1] / QK8_0};
int64_t output_ne_offset = 0;
int64_t output_ne[2] = {weight_ne[0], dst->ne[1]};
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
weight_nb, 2);
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
scale_ne_offset);
scale_elem_size, scale_ne, scale_nb, 2);
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
output_ne_offset);
output_elem_size, output_ne, output_nb, 2);
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
&workspaceSize, &executor));
if (workspaceAddr == nullptr) {
workspaceAddr = workspace_allocator.alloc(workspaceSize);
if (workspaceSize > 0 && workspaceAddr == nullptr) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
workspaceAddr, workspaceSize, executor, ctx.stream()));
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
// other splits
for (int64_t split = 1; split < split_size; split++) {
weight_ne_offset +=
weight_elem_size * weight_ne[0] * weight_ne[1];
weight_ne[0] = max_elem_size * (split + 1) > src0->ne[1]
? src0->ne[1] - (max_elem_size * split)
: max_elem_size;
scale_ne_offset += scale_elem_size * scale_ne[0] * scale_ne[1];
scale_ne[0] = weight_ne[0];
output_ne_offset +=
output_elem_size * output_ne[0] * output_ne[1];
output_ne[0] = weight_ne[0];
acl_weight_tensor = ggml_cann_create_tensor(
(char*)src0->data + batch0 * weight_stride,
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
weight_nb, 2, ACL_FORMAT_ND, weight_ne_offset);
acl_scale_tensor = ggml_cann_create_tensor(
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
scale_elem_size, scale_ne, scale_nb, 2, ACL_FORMAT_ND,
scale_ne_offset);
acl_output_tensor = ggml_cann_create_tensor(
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
output_ne_offset);
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
nullptr, nullptr, nullptr, nullptr, QK8_0,
acl_output_tensor, &workspaceSize, &executor));
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
workspaceAddr, workspaceSize, executor, ctx.stream()));
ACL_CHECK(aclDestroyTensor(acl_weight_tensor));
ACL_CHECK(aclDestroyTensor(acl_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
}
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
}
}
// cast out
if (dst->type != GGML_TYPE_F16) {
int64_t* output_cast_ne = dst->ne;
size_t output_cast_nb[GGML_MAX_DIMS];
output_cast_nb[0] = sizeof(uint16_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
}
aclTensor* acl_output_tensor = ggml_cann_create_tensor(
output_buffer, ACL_FLOAT16, output_elem_size, output_cast_ne,
output_cast_nb, GGML_MAX_DIMS);
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor,
ggml_cann_type_mapping(dst->type));
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
int64_t* output_cast_ne = dst->ne;
size_t output_cast_nb[GGML_MAX_DIMS];
output_cast_nb[0] = sizeof(uint16_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
output_cast_nb[i] = output_cast_nb[i - 1] * output_cast_ne[i - 1];
}
aclTensor* acl_output_tensor =
ggml_cann_create_tensor(output_buffer, ACL_FLOAT16, output_elem_size,
output_cast_ne, output_cast_nb, GGML_MAX_DIMS);
aclTensor* acl_dst_tensor = ggml_cann_create_tensor(dst);
aclnn_cast(ctx, acl_output_tensor, acl_dst_tensor, ACL_FLOAT);
ACL_CHECK(aclDestroyTensor(acl_output_tensor));
ACL_CHECK(aclDestroyTensor(acl_dst_tensor));
}
void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
@ -2964,14 +2714,12 @@ static void aclnn_index_fill_tensor(ggml_backend_cann_context& ctx,
static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
aclTensor* acl_cos_repeat_tensor,
aclTensor* acl_sin_repeat_tensor,
float theta_scale, float freq_scale,
float attn_factor, bool is_neox) {
float theta_scale, bool is_neox) {
// init sin/cos cache; the cache uses a different repeat method depending on
// @param.is_neox
ggml_tensor* src0 = dst->src[0]; // input
ggml_tensor* src1 = dst->src[1]; // position
ggml_tensor* src2 = dst->src[2]; // freq_factors
// arange, [0,1,...,ne0/2]
int64_t arange_length = src0->ne[0] / 2;
@ -3000,26 +2748,11 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
ggml_cann_pool_alloc theta_scale_allocator(ctx.pool(),
arange_length * sizeof(float_t));
void* theta_scale_buffer = theta_scale_allocator.get();
aclTensor* acl_theta_scale_tensor = aclnn_values(
aclTensor* acl_theta_scale_tensor = aclnn_ones(
ctx, theta_scale_buffer, arange_length * sizeof(float_t), arange_ne,
GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), theta_scale);
aclnn_pow_tensor_tensor(ctx, acl_theta_scale_tensor, acl_arange_tensor);
// freq_scale
if (freq_scale != 1) {
aclnn_muls(ctx, acl_theta_scale_tensor, freq_scale, nullptr, true);
}
// freq_factors
if (src2) {
aclTensor* acl_freq_factors_tensor = ggml_cann_create_tensor(
src2->data, ggml_cann_type_mapping(src2->type),
ggml_type_size(src2->type), arange_ne, arange_nb, GGML_MAX_DIMS);
aclnn_div_tensor(ctx, acl_theta_scale_tensor, acl_freq_factors_tensor,
nullptr, true);
ACL_CHECK(aclDestroyTensor(acl_freq_factors_tensor));
}
// position
GGML_ASSERT(src1->type == GGML_TYPE_I32);
int64_t position_length = src1->ne[0];
@ -3083,12 +2816,6 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
GGML_MAX_DIMS, ACL_FORMAT_ND);
aclnn_cos(ctx, acl_permute_tensor, acl_cos_tensor);
// attn_factor
if (attn_factor != 1) {
aclnn_muls(ctx, acl_sin_tensor, attn_factor, nullptr, true);
aclnn_muls(ctx, acl_cos_tensor, attn_factor, nullptr, true);
}
// repeat
if (is_neox) {
int64_t repeatsArray[] = {1, 1, 1, 2};
@ -3114,27 +2841,15 @@ static void aclnn_cache_init(ggml_backend_cann_context& ctx, ggml_tensor* dst,
ACL_CHECK(aclDestroyTensor(acl_cos_tensor));
}
// Hand-written prototypes for the two aclnnRotaryPositionEmbedding entry
// points, given C linkage when this file is compiled as C++.
// NOTE(review): presumably declared here because no aclnnop header providing
// them is included by this file — confirm against the CANN toolkit in use.
#ifdef __cplusplus
extern "C" {
#endif
// Phase 1: workspace-size query. Takes the input x, the cos/sin tensors, a
// mode selector and the output tensor; reports the required workspace size
// and an executor through the two out-parameters.
aclnnStatus aclnnRotaryPositionEmbeddingGetWorkspaceSize(
const aclTensor* x, const aclTensor* cos, const aclTensor* sin,
int64_t mode, const aclTensor* yOut, uint64_t* workspaceSize,
aclOpExecutor** executor);
// Phase 2: launch. Takes the workspace buffer and size together with the
// executor obtained from phase 1 and the stream to run on.
aclnnStatus aclnnRotaryPositionEmbedding(void* workspace,
uint64_t workspaceSize,
aclOpExecutor* executor,
aclrtStream stream);
#ifdef __cplusplus
}
#endif
void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
// TODO: use ascendc
// Only test with LLAMA model.
ggml_tensor* src0 = dst->src[0]; // input
ggml_tensor* src2 = dst->src[2]; // freq_factors
// TODO: with freq_factors
GGML_ASSERT(src2 == NULL);
// param
float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow;
// const int n_past = ((int32_t *) dst->op_params)[0];
@ -3152,11 +2867,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
memcpy(&beta_fast, (int32_t*)dst->op_params + 9, sizeof(float));
memcpy(&beta_slow, (int32_t*)dst->op_params + 10, sizeof(float));
// TODO: n_dims <= ne0
GGML_ASSERT(n_dims == ne0);
GGML_ASSERT(n_dims <= ne0);
GGML_ASSERT(n_dims % 2 == 0);
// TODO: ext_factor != 0
GGML_ASSERT(ext_factor == 0);
// TODO: freq_scale != 1
GGML_ASSERT(freq_scale == 1);
const float theta_scale = powf(freq_base, -2.0f / n_dims);
@ -3187,13 +2904,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ggml_cann_create_tensor(cos_buffer, ACL_FLOAT, sizeof(float_t),
sin_reshape_ne, sin_reshape_nb, GGML_MAX_DIMS);
aclnn_cache_init(ctx, dst, acl_cos_reshape_tensor, acl_sin_reshape_tensor,
theta_scale, freq_scale, attn_factor, is_neox);
aclTensor* acl_src = ggml_cann_create_tensor(src0);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
#ifdef ASCEND_310P
// Special ROPE operation for 310P
theta_scale, is_neox);
// roll input
void* input_roll_buffer;
@ -3236,7 +2947,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
}
acl_minus_one_tensor = aclnn_values(
acl_minus_one_tensor = aclnn_ones(
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
int64_t dim = 3;
@ -3263,15 +2974,17 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(acl_input_roll_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_tensor));
// init [-1, -1, -1, 1, 11...]
minus_one_scale_buffer = minus_one_scale_allocator.get();
int64_t minus_one_ne[4] = {src0->ne[0], 1, 1, 1};
size_t minus_one_nb[GGML_MAX_DIMS];
minus_one_nb[0] = sizeof(float_t);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
minus_one_nb[i] = minus_one_nb[i - 1] * minus_one_ne[i - 1];
}
acl_minus_one_tensor = aclnn_values(
acl_minus_one_tensor = aclnn_ones(
ctx, minus_one_scale_buffer, sizeof(float_t) * src0->ne[0],
minus_one_ne, GGML_MAX_DIMS, ACL_FLOAT, sizeof(float_t), 1);
// -1 * first half
@ -3313,12 +3026,14 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
acl_input_roll_mul_scale_tensor);
// output
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
void* output_fp32_buffer;
if (src0->type == GGML_TYPE_F32) {
aclnn_inplace_mul(ctx, acl_src, acl_cos_reshape_tensor);
aclnn_inplace_mul(ctx, acl_src0, acl_cos_reshape_tensor);
aclnn_inplace_mul(ctx, acl_input_roll_mul_scale_tensor,
acl_sin_reshape_tensor);
aclnn_add(ctx, acl_src, acl_input_roll_mul_scale_tensor, acl_dst);
aclnn_add(ctx, acl_src0, acl_input_roll_mul_scale_tensor, acl_dst);
// TODO: ne0 != n_dims in mode2
} else if (src0->type == GGML_TYPE_F16) {
size_t input_fp32_nb[GGML_MAX_DIMS];
@ -3345,7 +3060,7 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
aclTensor* output_fp32_tensor = ggml_cann_create_tensor(
output_fp32_buffer, ACL_FLOAT, sizeof(float_t), dst->ne,
input_fp32_nb, GGML_MAX_DIMS);
aclnn_mul(ctx, acl_src, acl_cos_reshape_tensor, input_fp32_tensor1);
aclnn_mul(ctx, acl_src0, acl_cos_reshape_tensor, input_fp32_tensor1);
aclnn_mul(ctx, acl_input_roll_mul_scale_tensor, acl_sin_reshape_tensor,
input_fp32_tensor2);
aclnn_add(ctx, input_fp32_tensor1, input_fp32_tensor2,
@ -3355,73 +3070,13 @@ void ggml_cann_rope(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
ACL_CHECK(aclDestroyTensor(input_fp32_tensor1));
ACL_CHECK(aclDestroyTensor(input_fp32_tensor2));
ACL_CHECK(aclDestroyTensor(output_fp32_tensor));
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_src));
}
return;
#endif
// src0 == GGML_TYPE_F16
// TODO: optimization this `if` code
if (src0->type == GGML_TYPE_F16) {
ggml_cann_pool_alloc sin_final_allocator(
ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
ggml_cann_pool_alloc cos_final_allocator(
ctx.pool(), src0->ne[0] * src0->ne[2] * ggml_type_size(src0->type));
void* sin_final_buffer = sin_final_allocator.get();
void* cos_final_buffer = cos_final_allocator.get();
int64_t sin_final_ne[4] = {src0->ne[0], 1, src0->ne[2], 1};
size_t sin_final_nb[GGML_MAX_DIMS];
sin_final_nb[0] = ggml_type_size(src0->type);
for (int i = 1; i < GGML_MAX_DIMS; i++) {
sin_final_nb[i] = sin_final_nb[i - 1] * sin_final_ne[i - 1];
}
aclTensor* acl_sin_final_tensor = ggml_cann_create_tensor(
sin_final_buffer, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
GGML_MAX_DIMS);
aclTensor* acl_cos_final_tensor = ggml_cann_create_tensor(
cos_final_buffer, ggml_cann_type_mapping(src0->type),
ggml_type_size(src0->type), sin_final_ne, sin_final_nb,
GGML_MAX_DIMS);
aclnn_cast(ctx, acl_sin_reshape_tensor, acl_sin_final_tensor,
ggml_cann_type_mapping(src0->type));
aclnn_cast(ctx, acl_cos_reshape_tensor, acl_cos_final_tensor,
ggml_cann_type_mapping(src0->type));
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
acl_sin_reshape_tensor = acl_sin_final_tensor;
acl_cos_reshape_tensor = acl_cos_final_tensor;
}
uint64_t workspaceSize = 0;
aclOpExecutor* executor;
void* workspaceAddr = nullptr;
int acl_mode = mode;
if (mode == 0) {
acl_mode = 1;
}
ACL_CHECK(aclnnRotaryPositionEmbeddingGetWorkspaceSize(
acl_src, acl_cos_reshape_tensor, acl_sin_reshape_tensor, acl_mode,
acl_dst, &workspaceSize, &executor));
if (workspaceSize > 0) {
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
workspaceAddr = workspace_allocator.get();
}
ACL_CHECK(aclnnRotaryPositionEmbedding(workspaceAddr, workspaceSize,
executor, ctx.stream()));
ACL_CHECK(aclDestroyTensor(acl_src));
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_sin_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_cos_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_minus_one_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_roll_mul_scale_tensor));
ACL_CHECK(aclDestroyTensor(acl_input_roll_reshape_tensor));
ACL_CHECK(aclDestroyTensor(acl_src0));
ACL_CHECK(aclDestroyTensor(acl_dst));
}

View File

@ -211,20 +211,17 @@ struct ggml_cann_pool_alloc {
struct ggml_backend_cann_context {
int32_t device; /**< Device ID. */
std::string name; /**< Name of the device. */
std::string description; /**< Description of the device. */
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {
{nullptr}}; /**< Array of streams for the device. */
/**
* @brief Constructor for initializing the context with a given device.
* @param device Device ID.
*/
explicit ggml_backend_cann_context(int device)
: device(device), name("CANN" + std::to_string(device)) {
ggml_cann_set_device(device);
description = aclrtGetSocName();
}
: device(device), name("CANN" + std::to_string(device)) {}
/**
* @brief Destructor for cleaning up resources.

View File

@ -1,3 +1,7 @@
if (NOT SOC_TYPE)
set (SOC_TYPE "Ascend910B3")
endif()
file(GLOB SRC_FILES
get_row_f32.cpp
get_row_f16.cpp
@ -9,6 +13,7 @@ file(GLOB SRC_FILES
dup.cpp
)
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
@ -25,6 +30,4 @@ ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)
message(STATUS "CANN: compile ascend kernels witch SOC_TYPE:${SOC_TYPE}, SOC_VERSION:${SOC_VERSION}, compile macro:-D${SOC_TYPE_COMPILE_OPTION}.")
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

View File

@ -5,7 +5,6 @@
using namespace AscendC;
#define BUFFER_NUM 2
const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>
template <typename SRC_T, typename DST_T>
class DupByRows {
@ -52,36 +51,24 @@ class DupByRows {
__aicore__ inline void copy_in() {
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
const size_t elem_per_block = 32 / sizeof(SRC_T);
size_t tail = num_elem % elem_per_block;
size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
DataCopy(src_local, src_gm, cpy_elements_len);
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
DataCopyPadExtParams<SRC_T> padParams;
DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
src_queue.EnQue(src_local);
}
__aicore__ inline void copy_out() {
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
#ifdef ASCEND_310P
const size_t elem_per_block = 32 / sizeof(DST_T);
size_t tail = num_elem % elem_per_block;
size_t len = num_elem & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(dst_gm, dst_local, len);
}
if(tail != 0) {
for (size_t i = tail; i < elem_per_block; i++) {
dst_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(dst_gm[len], dst_local[len], elem_per_block);
SetAtomicNone();
}
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
DataCopyPad(dst_gm, dst_local, dataCopyParams);
#endif
dst_queue.FreeTensor(dst_local);
}

View File

@ -14,7 +14,7 @@ class GET_ROW_F16 {
int64_t *output_ne_ub, size_t *output_nb_ub) {
// TODO, use template for F16/f32
int64_t op_block_num = GetBlockNum();
op_block_idx = GetBlockIdx();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
@ -59,42 +59,32 @@ class GET_ROW_F16 {
}
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
size_t origin_len = len;
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
const size_t elem_per_block = 32 / sizeof(half);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if(tail != 0) {
len += elem_per_block;
}
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(half);
DataCopyPadExtParams<half> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
}
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
#endif
}
output_queue.FreeTensor(output_local);
}
@ -160,7 +150,6 @@ class GET_ROW_F16 {
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
};
template <typename T>

View File

@ -13,7 +13,7 @@ class GET_ROW_F32 {
int64_t *indices_ne_ub, size_t *indices_nb_ub,
int64_t *output_ne_ub, size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
op_block_idx = GetBlockIdx();
int64_t op_block_idx = GetBlockIdx();
for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
@ -55,40 +55,31 @@ class GET_ROW_F32 {
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if(tail != 0) {
len += elem_per_block;
}
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPadExtParams<float> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
}
input_queue.EnQue(input_local);
}
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
#endif
}
output_queue.FreeTensor(output_local);
}
@ -153,7 +144,6 @@ class GET_ROW_F32 {
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
};
template <typename T>

View File

@ -2,15 +2,6 @@
// optimize me. Use template to avoid copy code.
using namespace AscendC;
#ifdef ASCEND_310P // 310P not support 4bit get row
extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
GM_ADDR input_gm, GM_ADDR indices_gm, GM_ADDR output_gm,
GM_ADDR input_ne_gm, GM_ADDR indices_ne_gm, GM_ADDR indices_nb_gm,
GM_ADDR output_ne_gm, GM_ADDR output_nb_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support 4bit get row.\n");
}
#else
#define BUFFER_NUM 2
@ -200,5 +191,3 @@ extern "C" __global__ __aicore__ void ascendc_get_row_q4_0(
indices_nb_ub, output_ne_ub, output_nb_ub);
op.calculate();
}
#endif // #ifdef ASCEND_310P

View File

@ -1,14 +1,6 @@
#include "kernel_operator.h"
using namespace AscendC;
#ifdef ASCEND_310P
extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support f16->8bit quantization.\n");
}
#else
#define BUFFER_NUM 2
#define QK8_0 32
@ -214,5 +206,3 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f16_q8_0(
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
#endif // #ifdef ASCEND_310P

View File

@ -1,14 +1,6 @@
#include "kernel_operator.h"
using namespace AscendC;
#ifdef ASCEND_310P // 310P not support f32->8bit quantization
extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support f32->8bit quantization.\n");
}
#else
#define BUFFER_NUM 2
#define QK8_0 32
@ -212,5 +204,3 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_q8_0(
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
#endif // #ifdef ASCEND_310P

View File

@ -1,21 +1,6 @@
#include "kernel_operator.h"
using namespace AscendC;
#ifdef ASCEND_310P // 310P not support float->4bit quantization
extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support f32->4bit quantization.\n");
}
extern "C" __global__ __aicore__ void ascendc_quantize_f16_to_q4_0(
GM_ADDR input_gm, GM_ADDR output_gm, GM_ADDR input_ne_gm,
GM_ADDR input_nb_gm, GM_ADDR output_ne_gm) {
// let following test cases can continue run, here just print error information. Of Cource the test case that call this operator is failed.
printf("Ascend310P not support f16->4bit quantization.\n");
}
#else
#define BUFFER_NUM 2
#define Group_Size 32
@ -291,5 +276,3 @@ extern "C" __global__ __aicore__ void ascendc_quantize_f32_to_q4_0(
op.init(input_gm, output_gm, input_ne_ub, input_nb_ub, output_ne_ub);
op.calculate();
}
#endif // #ifdef ASCEND_310P

View File

@ -418,12 +418,6 @@ typedef struct {
} block_iq4_xs;
static_assert(sizeof(block_iq4_xs) == sizeof(ggml_half) + sizeof(uint16_t) + QK_K/64 + QK_K/2, "wrong iq4_xs block size/padding");
typedef struct {
ggml_half d[4]; // deltas for 4 iq4_nl blocks
uint8_t qs[QK4_NL * 2];// nibbles / quants for 4 iq4_nl blocks
} block_iq4_nlx4;
static_assert(sizeof(block_iq4_nlx4) == 4 * sizeof(ggml_half) + QK4_NL * 2, "wrong iq4_nlx4 block size/padding");
#endif // GGML_COMMON_DECL
#endif // GGML_COMMON_DECL

Some files were not shown because too many files have changed in this diff Show More