Mirror of https://github.com/ggerganov/whisper.cpp.git (synced 2025-07-03 07:50:28 +02:00)
Compare commits
37 Commits
Commits (SHA1):
0b9af32a8b
11b1b63b14
0e26a6c92e
66d8f0b7f1
ba5bcde874
ab0a8593c5
668ffc9b23
9962371f71
993acb5d41
a3d0aa73d1
14c57952f7
6c369d6788
4cdd9aad9b
f38c057503
1e5544b39b
d5673af79f
a28dacec65
dbe29d4e33
fe3a67c546
b138ff2be3
cf6f1e4181
620a223814
f39f9690ec
f9ca90256b
2623640cd6
d87de61ae6
f5f485f899
e77b27c331
a5cc3dc8a2
37a709f655
3a5302108d
d2ee117a0a
db8ccdb850
d2419030b0
8986690c2a
9286d3f584
940de9dbe9
.devops/main-cuda.Dockerfile (new file, 38 lines)
@@ -0,0 +1,38 @@
ARG UBUNTU_VERSION=22.04
# This needs to generally match the container host's environment.
ARG CUDA_VERSION=12.3.1
# Target the CUDA build image
ARG BASE_CUDA_DEV_CONTAINER=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
# Target the CUDA runtime image
ARG BASE_CUDA_RUN_CONTAINER=nvidia/cuda:${CUDA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}

FROM ${BASE_CUDA_DEV_CONTAINER} AS build
WORKDIR /app

# Unless otherwise specified, we make a fat build.
ARG CUDA_DOCKER_ARCH=all
# Set nvcc architecture
ENV CUDA_DOCKER_ARCH=${CUDA_DOCKER_ARCH}
# Enable cuBLAS
ENV WHISPER_CUBLAS=1

RUN apt-get update && \
    apt-get install -y build-essential \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

# Ref: https://stackoverflow.com/a/53464012
ENV CUDA_MAIN_VERSION=12.3
ENV LD_LIBRARY_PATH /usr/local/cuda-${CUDA_MAIN_VERSION}/compat:$LD_LIBRARY_PATH

COPY .. .
RUN make

FROM ${BASE_CUDA_RUN_CONTAINER} AS runtime
WORKDIR /app

RUN apt-get update && \
    apt-get install -y curl ffmpeg \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

COPY --from=build /app /app
ENTRYPOINT [ "bash", "-c" ]
.devops/main.Dockerfile (new file, 19 lines)
@@ -0,0 +1,19 @@
FROM ubuntu:22.04 AS build
WORKDIR /app

RUN apt-get update && \
    apt-get install -y build-essential \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

COPY .. .
RUN make

FROM ubuntu:22.04 AS runtime
WORKDIR /app

RUN apt-get update && \
    apt-get install -y curl ffmpeg \
    && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives/*

COPY --from=build /app /app
ENTRYPOINT [ "bash", "-c" ]
.github/workflows/build.yml (vendored, 32 lines changed)

@@ -117,7 +117,6 @@ jobs:
  -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
  set -e
  apt update
- apt install -y clang
  apt install -y clang build-essential cmake libsdl2-dev
  cmake . -DWHISPER_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
  make

@@ -167,7 +166,7 @@ jobs:
  s2arc: x64
  jnaPath: win32-x86-64
  - sdl2: ON
- s2ver: 2.26.0
+ s2ver: 2.28.5

  steps:
  - name: Clone

@@ -224,11 +223,14 @@ jobs:
  - arch: Win32
  obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x86.zip
  s2arc: x86
+ clblast: OFF
  - arch: x64
  obzip: https://github.com/OpenMathLib/OpenBLAS/releases/download/v0.3.25/OpenBLAS-0.3.25-x64.zip
  s2arc: x64
+ clblast: ON
+ clver: 1.6.1
  - sdl2: ON
- s2ver: 2.26.0
+ s2ver: 2.28.5

  steps:
  - name: Clone

@@ -253,6 +255,18 @@ jobs:
  7z x sdl2.zip
  echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV

+ - name: Install OpenCL
+   if: matrix.clblast == 'ON'
+   run: vcpkg.exe --triplet=${{ matrix.arch }}-windows install opencl
+
+ - name: Fetch CLBlast and set CLBlast_DIR
+   if: matrix.clblast == 'ON'
+   run: |
+     C:/msys64/usr/bin/wget.exe -qO clblast.zip https://github.com/CNugteren/CLBlast/releases/download/${{ matrix.clver }}/CLBlast-${{ matrix.clver }}-windows-x64.zip
+     7z x clblast.zip
+     7z x CLBlast-${{ matrix.clver }}-windows-x64.7z
+     echo "CLBlast_DIR=$env:GITHUB_WORKSPACE/CLBlast-${{ matrix.clver }}-windows-x64/lib/cmake/CLBlast" >> $env:GITHUB_ENV
+
  - name: Configure
  run: >
  cmake -S . -B ./build -A ${{ matrix.arch }}

@@ -260,6 +274,7 @@ jobs:
  -DWHISPER_OPENBLAS=${{ matrix.blas }}
  -DCMAKE_LIBRARY_PATH="$env:OPENBLAS_PATH/lib"
  -DWHISPER_SDL2=${{ matrix.sdl2 }}
+ -DWHISPER_CLBLAST=${{ matrix.clblast }}

  - name: Build
  run: |

@@ -274,11 +289,15 @@ jobs:
  if: matrix.sdl2 == 'ON'
  run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}

+ - name: Copy clblast.dll
+   if: matrix.clblast == 'ON'
+   run: copy "$env:CLBlast_DIR/../../clblast.dll" build/bin/${{ matrix.build }}
+
  - name: Upload binaries
  if: matrix.blas == 'ON' && matrix.sdl2 == 'ON'
  uses: actions/upload-artifact@v1
  with:
- name: whisper-blas-bin-${{ matrix.arch }}
+ name: whisper-blas${{ matrix.clblast == 'ON' && '-clblast' || ''}}-bin-${{ matrix.arch }}
  path: build/bin/${{ matrix.build }}

  windows-cublas:

@@ -295,7 +314,7 @@ jobs:
  - arch: x64
  s2arc: x64
  - sdl2: ON
- s2ver: 2.26.0
+ s2ver: 2.28.5

  steps:
  - name: Clone

@@ -321,7 +340,8 @@ jobs:
  run: >
  cmake -S . -B ./build -A ${{ matrix.arch }}
  -DCMAKE_BUILD_TYPE=${{ matrix.build }}
- -DWHISPER_CUBLAS=1
+ -DWHISPER_CUBLAS=${{ matrix.cublas }}
+ -DWHISPER_SDL2=${{ matrix.sdl2 }}

  - name: Build ${{ matrix.cuda-toolkit }}
  run: |
.github/workflows/docker.yml (vendored, new file, 57 lines)
@@ -0,0 +1,57 @@
name: Publish Docker image

on:
  pull_request:
  push:
    branches:
      - master

jobs:
  push_to_registry:
    name: Push Docker image to Docker Hub
    if: github.event.pull_request.draft == false

    runs-on: ubuntu-latest
    env:
      COMMIT_SHA: ${{ github.sha }}
    strategy:
      matrix:
        config:
          - { tag: "main", dockerfile: ".devops/main.Dockerfile", platform: "linux/amd64,linux/arm64" }
          - { tag: "main-cuda", dockerfile: ".devops/main-cuda.Dockerfile", platform: "linux/amd64" }

    steps:
      - name: Check out the repo
        uses: actions/checkout@v3

      - name: Set up QEMU
        uses: docker/setup-qemu-action@v3

      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Log in to Docker Hub
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.repository_owner }}
          password: ${{ secrets.GITHUB_TOKEN }}

      - name: Build and push Docker image (versioned)
        if: github.event_name == 'push'
        uses: docker/build-push-action@v5
        with:
          context: .
          push: true
          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}-${{ env.COMMIT_SHA }}"
          file: ${{ matrix.config.dockerfile }}

      - name: Build and push Docker image (tagged)
        uses: docker/build-push-action@v4
        with:
          context: .
          push: ${{ github.event_name == 'push' }}
          platforms: ${{ matrix.config.platforms }}
          tags: "ghcr.io/${{ github.repository }}:${{ matrix.config.tag }}"
          file: ${{ matrix.config.dockerfile }}
@@ -1,6 +1,6 @@
  cmake_minimum_required (VERSION 3.5)

- project(whisper.cpp VERSION 1.5.2)
+ project(whisper.cpp VERSION 1.5.4)

  # Add path to modules
  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/")

@@ -218,11 +218,17 @@ if (WHISPER_CUBLAS)
  add_compile_definitions(GGML_USE_CUBLAS)

  if (WHISPER_STATIC)
- set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+ if (WIN32)
+ # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+ set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
+ else ()
+ set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+ endif()
  else()
  set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
  endif()

+ set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cuda_driver)
  else()
  message(FATAL_ERROR "cuBLAS not found")
  endif()

@@ -338,8 +344,8 @@ else()
  endif()
  else()
  if (EMSCRIPTEN)
- set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
+ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -s TOTAL_STACK=5242880")
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
  else()
  if(NOT WHISPER_NO_AVX)
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx")

@@ -521,7 +527,13 @@ endif()

  if (GGML_SOURCES_CUDA)
  message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
- set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
+ # Only configure gmml CUDA architectures is not globally set
+ if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
+ # Not overriden by user, so set defaults
+ set(GGML_CUDA_ARCHITECTURES 52 61 70)
+ endif()
+ message(STATUS "GGML Configuring CUDA architectures ${GGML_CUDA_ARCHITECTURES}")
+ set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES ${GGML_CUDA_ARCHITECTURES})
  set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
  endif()
Makefile (2 lines changed)
@@ -206,7 +206,7 @@ ifdef WHISPER_CUBLAS

  CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
  CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
- LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
+ LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
  WHISPER_OBJ += ggml-cuda.o
  NVCC = nvcc
  NVCCFLAGS = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
@@ -13,9 +13,13 @@ let package = Package(
  products: [
  .library(name: "whisper", targets: ["whisper"]),
  ],
+ dependencies: [
+ .package(url: "https://github.com/ggerganov/ggml.git", .branch("master"))
+ ],
  targets: [
  .target(
  name: "whisper",
+ dependencies: ["ggml"],
  path: ".",
  exclude: [
  "bindings",

@@ -32,14 +36,8 @@ let package = Package(
  "Makefile"
  ],
  sources: [
- "ggml.c",
  "whisper.cpp",
- "ggml-alloc.c",
- "ggml-backend.c",
- "ggml-quants.c",
- "ggml-metal.m"
  ],
- resources: [.process("ggml-metal.metal")],
  publicHeadersPath: "spm-headers",
  cSettings: [
  .unsafeFlags(["-Wno-shorten-64-to-32", "-O3", "-DNDEBUG"]),
README.md (33 lines changed)

@@ -6,7 +6,7 @@
  [](https://opensource.org/licenses/MIT)
  [](https://www.npmjs.com/package/whisper.cpp/)

- Stable: [v1.5.2](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.2) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
+ Stable: [v1.5.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.5.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)

  High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:

@@ -33,6 +33,7 @@ Supported platforms:
  - [x] [WebAssembly](examples/whisper.wasm)
  - [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168)]
  - [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
+ - [x] [docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)

  The entire high-level implementation of the model is contained in [whisper.h](whisper.h) and [whisper.cpp](whisper.cpp).
  The rest of the code is part of the [ggml](https://github.com/ggerganov/ggml) machine learning library.

@@ -448,6 +449,36 @@ make clean
  WHISPER_OPENBLAS=1 make -j
  ```

+ ## Docker
+
+ ### Prerequisites
+ * Docker must be installed and running on your system.
+ * Create a folder to store big models & intermediate files (ex. /whisper/models)
+
+ ### Images
+ We have two Docker images available for this project:
+
+ 1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
+ 2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
+
+ ### Usage
+
+ ```shell
+ # download model and persist it in a local folder
+ docker run -it --rm \
+   -v path/to/models:/models \
+   whisper.cpp:main "./models/download-ggml-model.sh base /models"
+ # transcribe an audio file
+ docker run -it --rm \
+   -v path/to/models:/models \
+   -v path/to/audios:/audios \
+   whisper.cpp:main "./main -m /models/ggml-base.bin -f /audios/jfk.wav"
+ # transcribe an audio file in samples folder
+ docker run -it --rm \
+   -v path/to/models:/models \
+   whisper.cpp:main "./main -m /models/ggml-base.bin -f ./samples/jfk.wav"
+ ```
+
  ## Limitations

  - Inference only
Submodule bindings/ios updated: 88c28eb833...b21b6ff325
@@ -1,6 +1,6 @@
  {
  "name": "whisper.cpp",
- "version": "1.5.2",
+ "version": "1.5.4",
  "description": "Whisper speech recognition",
  "main": "whisper.js",
  "scripts": {
@@ -70,7 +70,7 @@ extern "C" {
  void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);

  // compute graph without a plan
- void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);

  // check if the backend supports an operation
  bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
@@ -156,8 +156,8 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
  backend->iface.graph_plan_compute(backend, plan);
  }

- void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- backend->iface.graph_compute(backend, cgraph);
+ return backend->iface.graph_compute(backend, cgraph);
  }

  bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
@@ -52,7 +52,7 @@ extern "C" {

  GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
  GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
- GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
  GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);

  // tensor copy between different backends
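ggml_backend_graph_compute() now returns a bool instead of void, so callers can detect a failed graph evaluation. A minimal sketch of how a caller might use the new return value (illustrative only, not part of the diff):

```c
#include <stdio.h>
#include "ggml-backend.h"

// Illustrative sketch: report failure instead of silently assuming success.
static void compute_or_warn(ggml_backend_t backend, struct ggml_cgraph * gf) {
    if (!ggml_backend_graph_compute(backend, gf)) {
        fprintf(stderr, "ggml_backend_graph_compute failed\n");
    }
}
```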
@@ -24,9 +24,9 @@ struct whisper_coreml_context * whisper_coreml_init(const char * path_model) {

  // select which device to run the Core ML model on
  MLModelConfiguration *config = [[MLModelConfiguration alloc] init];
- config.computeUnits = MLComputeUnitsCPUAndGPU;
+ // config.computeUnits = MLComputeUnitsCPUAndGPU;
  //config.computeUnits = MLComputeUnitsCPUAndNeuralEngine;
- //config.computeUnits = MLComputeUnitsAll;
+ config.computeUnits = MLComputeUnitsAll;

  const void * data = CFBridgingRetain([[whisper_encoder_impl alloc] initWithContentsOfURL:url_model configuration:config error:nil]);
@@ -14,6 +14,10 @@ if (WHISPER_SDL2)
  message(STATUS "SDL2_LIBRARIES = ${SDL2_LIBRARIES}")
  endif()

+ if (WHISPER_CLBLAST)
+ find_package(CLBlast REQUIRED)
+ endif()
+
  # common

  set(TARGET common)
@@ -1,30 +1,18 @@
  if (WHISPER_SDL2)
  # talk-llama
  set(TARGET talk-llama)
- #add_executable(${TARGET} talk-llama.cpp llama.cpp)
- #target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
- #target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
- # TODO: this is temporary
- # need to export ggml symbols for MSVC, but too lazy ..
- add_executable(${TARGET}
- talk-llama.cpp
- llama.cpp
- ../common.cpp
- ../common-sdl.cpp
- ../../ggml.c
- ../../ggml-alloc.c
- ../../ggml-backend.c
- ../../ggml-quants.c
- ../../whisper.cpp)
+ add_executable(${TARGET} talk-llama.cpp llama.cpp)
+ target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS})
+
+ if (WHISPER_CLBLAST)
+ set(CLBLAST_LIBNAME clblast)
+ endif ()
+ target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${SDL2_LIBRARIES} ${CLBLAST_LIBNAME} ${CMAKE_THREAD_LIBS_INIT})

  if(WIN32)
  # It requires Windows 8.1 or later for PrefetchVirtualMemory
  target_compile_definitions(${TARGET} PRIVATE -D_WIN32_WINNT=0x0602)
  endif()

- target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
- target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
-
  include(DefaultTargetOptions)
  endif ()
(One file diff is suppressed because it is too large.)
@@ -39,10 +39,11 @@

  #define LLAMA_MAX_RNG_STATE (64*1024)

+ #define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla'
  #define LLAMA_FILE_MAGIC_GGSN 0x6767736eu // 'ggsn'

  #define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN
- #define LLAMA_SESSION_VERSION 2
+ #define LLAMA_SESSION_VERSION 3

  #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_METAL)
  // Defined when llama.cpp is compiled with support for offloading model layers to GPU.

@@ -126,7 +127,7 @@ extern "C" {
  bool sorted;
  } llama_token_data_array;

- typedef void (*llama_progress_callback)(float progress, void *ctx);
+ typedef bool (*llama_progress_callback)(float progress, void *ctx);

  // Input data for llama_decode
  // A llama_batch object can contain input about one or many sequences

@@ -158,16 +159,38 @@ extern "C" {
  llama_seq_id all_seq_id; // used if seq_id == NULL
  } llama_batch;

+ enum llama_model_kv_override_type {
+ LLAMA_KV_OVERRIDE_INT,
+ LLAMA_KV_OVERRIDE_FLOAT,
+ LLAMA_KV_OVERRIDE_BOOL,
+ };
+
+ struct llama_model_kv_override {
+ char key[128];
+ enum llama_model_kv_override_type tag;
+ union {
+ int64_t int_value;
+ double float_value;
+ bool bool_value;
+ };
+ };
+
  struct llama_model_params {
  int32_t n_gpu_layers; // number of layers to store in VRAM
  int32_t main_gpu; // the GPU that is used for scratch and small tensors
  const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)

- // called with a progress value between 0 and 1, pass NULL to disable
+ // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+ // If the provided progress_callback returns true, model loading continues.
+ // If it returns false, model loading is immediately aborted.
  llama_progress_callback progress_callback;

  // context pointer passed to the progress callback
  void * progress_callback_user_data;

+ // override key-value pairs of the model meta data
+ const struct llama_model_kv_override * kv_overrides;
+
  // Keep the booleans together to avoid misalignment during copy-by-value.
  bool vocab_only; // only load the vocabulary, no weights
  bool use_mmap; // use mmap if possible

@@ -185,17 +208,20 @@ extern "C" {
  // ref: https://github.com/ggerganov/llama.cpp/pull/2054
  float rope_freq_base; // RoPE base frequency, 0 = from model
  float rope_freq_scale; // RoPE frequency scaling factor, 0 = from model
- float yarn_ext_factor; // YaRN extrapolation mix factor, NaN = from model
+ float yarn_ext_factor; // YaRN extrapolation mix factor, negative = from model
  float yarn_attn_factor; // YaRN magnitude scaling factor
  float yarn_beta_fast; // YaRN low correction dim
  float yarn_beta_slow; // YaRN high correction dim
  uint32_t yarn_orig_ctx; // YaRN original context size

+ enum ggml_type type_k; // data type for K cache
+ enum ggml_type type_v; // data type for V cache
+
  // Keep the booleans together to avoid misalignment during copy-by-value.
  bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
- bool f16_kv; // use fp16 for KV cache, fp32 otherwise
- bool logits_all; // the llama_eval() call computes all logits, not just the last one
- bool embedding; // embedding mode only
+ bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
+ bool embedding; // embedding mode only
+ bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
  };

  // model quantization parameters

@@ -290,7 +316,9 @@ extern "C" {

  LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);

- LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
+ // TODO: become more consistent with returned int types across the API
+ LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx);
+ LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx);

  LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);

@@ -301,6 +329,23 @@ extern "C" {
  // Get the model's RoPE frequency scaling factor
  LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model);

+ // Functions to access the model's GGUF metadata scalar values
+ // - The functions return the length of the string on success, or -1 on failure
+ // - The output string is always null-terminated and cleared on failure
+ // - GGUF array values are not supported by these functions
+
+ // Get metadata value as a string by key name
+ LLAMA_API int llama_model_meta_val_str(const struct llama_model * model, const char * key, char * buf, size_t buf_size);
+
+ // Get the number of metadata key/value pairs
+ LLAMA_API int llama_model_meta_count(const struct llama_model * model);
+
+ // Get metadata key name by index
+ LLAMA_API int llama_model_meta_key_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
+ // Get metadata value as a string by index
+ LLAMA_API int llama_model_meta_val_str_by_index(const struct llama_model * model, int i, char * buf, size_t buf_size);
+
  // Get a string describing the model type
  LLAMA_API int llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size);

@@ -344,9 +389,60 @@ extern "C" {
  // KV cache
  //

- // Returns the number of tokens in the KV cache
- LLAMA_API DEPRECATED(int llama_get_kv_cache_token_count(const struct llama_context * ctx),
- "avoid using this, it will be removed in the future, instead - count the tokens in user code");
+ // Information associated with an individual cell in the KV cache view.
+ struct llama_kv_cache_view_cell {
+ // The position for this cell. Takes KV cache shifts into account.
+ // May be negative if the cell is not populated.
+ llama_pos pos;
+ };
+
+ // An updateable view of the KV cache.
+ struct llama_kv_cache_view {
+ // Number of KV cache cells. This will be the same as the context size.
+ int32_t n_cells;
+
+ // Maximum number of sequences that can exist in a cell. It's not an error
+ // if there are more sequences in a cell than this value, however they will
+ // not be visible in the view cells_sequences.
+ int32_t n_max_seq;
+
+ // Number of tokens in the cache. For example, if there are two populated
+ // cells, the first with 1 sequence id in it and the second with 2 sequence
+ // ids then you'll have 3 tokens.
+ int32_t token_count;
+
+ // Number of populated cache cells.
+ int32_t used_cells;
+
+ // Maximum contiguous empty slots in the cache.
+ int32_t max_contiguous;
+
+ // Index to the start of the max_contiguous slot range. Can be negative
+ // when cache is full.
+ int32_t max_contiguous_idx;
+
+ // Information for an individual cell.
+ struct llama_kv_cache_view_cell * cells;
+
+ // The sequences for each cell. There will be n_max_seq items per cell.
+ llama_seq_id * cells_sequences;
+ };
+
+ // Create an empty KV cache view. (use only for debugging purposes)
+ LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
+
+ // Free a KV cache view. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
+
+ // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
+ LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
+
+ // Returns the number of tokens in the KV cache (slow, use only for debug)
+ // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
+ LLAMA_API int llama_get_kv_cache_token_count(const struct llama_context * ctx);
+
+ // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
+ LLAMA_API int llama_get_kv_cache_used_cells(const struct llama_context * ctx);

  // Clear the KV cache
  LLAMA_API void llama_kv_cache_clear(

@@ -517,6 +613,12 @@ extern "C" {
  LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
  LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line

+ // Returns -1 if unknown, 1 for true or 0 for false.
+ LLAMA_API int llama_add_bos_token(const struct llama_model * model);
+
+ // Returns -1 if unknown, 1 for true or 0 for false.
+ LLAMA_API int llama_add_eos_token(const struct llama_model * model);
+
  // codellama infill tokens
  LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
  LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
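Taken together, the llama.h additions above (the bool-returning progress callback, llama_model_kv_override, the GGUF metadata accessors, and llama_add_bos_token) might be used as in the following sketch. llama_model_default_params(), llama_load_model_from_file() and llama_free_model() are assumed from the existing llama.h API, and the empty-key terminator convention for the kv_overrides array is an assumption as well:

```c
#include <stdio.h>
#include <string.h>
#include "llama.h"

// Sketch only: exercises the new fields and accessors shown in the hunks above.
static bool on_progress(float progress, void * ctx) {
    (void) ctx;
    fprintf(stderr, "\rloading: %3d%%", (int)(100.0f * progress));
    return true; // returning false would abort model loading
}

int main(void) {
    // one override plus a zeroed entry acting as terminator (assumed convention)
    struct llama_model_kv_override overrides[2];
    memset(overrides, 0, sizeof(overrides));
    strcpy(overrides[0].key, "example.key");   // hypothetical metadata key
    overrides[0].tag       = LLAMA_KV_OVERRIDE_INT;
    overrides[0].int_value = 8;

    struct llama_model_params mparams = llama_model_default_params();
    mparams.progress_callback = on_progress;
    mparams.kv_overrides      = overrides;

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    // dump the model's GGUF metadata using the new accessors
    char key[256], val[256];
    const int n_kv = llama_model_meta_count(model);
    for (int i = 0; i < n_kv; i++) {
        llama_model_meta_key_by_index(model, i, key, sizeof(key));
        llama_model_meta_val_str_by_index(model, i, val, sizeof(val));
        printf("%s = %s\n", key, val);
    }

    // -1 if unknown, 0 for false, 1 for true
    printf("add_bos_token: %d\n", llama_add_bos_token(model));

    llama_free_model(model);
    return 0;
}
```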
@@ -282,7 +282,6 @@ int main(int argc, char ** argv) {
  // tune these to your liking
  lcparams.n_ctx = 2048;
  lcparams.seed = 1;
- lcparams.f16_kv = true;
  lcparams.n_threads = params.n_threads;

  struct llama_context * ctx_llama = llama_new_context_with_model(model_llama, lcparams);
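The removed f16_kv flag has no direct replacement at this call site; if the same behaviour is wanted, the per-cache data types added to llama_context_params above would presumably be used instead. A hedged sketch of that assumption:

```c
#include "llama.h"

// Assumption (not from the diff): the old boolean maps onto the new
// per-cache data types in llama_context_params.
static struct llama_context_params make_ctx_params(void) {
    struct llama_context_params cparams = llama_context_default_params();
    cparams.type_k = GGML_TYPE_F16; // previously expressed as lcparams.f16_kv = true
    cparams.type_v = GGML_TYPE_F16;
    return cparams;
}
```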
@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  const int n_ctx = hparams.n_ctx;
  const int n_vocab = hparams.n_vocab;

- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
- ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+ ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head

- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
+ ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b

- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b

- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b

- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b

- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

  ctx_size += (6 + 12*n_layer)*256; // object overhead

@@ -524,8 +524,7 @@ bool gpt2_eval(
  struct ggml_tensor * KQ_scaled =
  ggml_scale(ctx0,
  KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
+ 1.0f/sqrt(float(n_embd)/n_head));
- );

  // KQ_masked = mask_past(KQ_scaled)
  // [n_past + N, N, 12]
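The size-accounting change above swaps the floating-point estimate n * ggml_type_sizef(type) for ggml_row_size(type, n), which returns an exact byte count (including any quantization block padding). A small illustrative helper showing the same pattern, assuming only ggml.h:

```c
#include "ggml.h"

// Illustrative sketch of the pattern used in the hunk above: exact byte size
// of an n_vocab x n_embd weight matrix stored with element type `wtype`.
static size_t embedding_bytes(enum ggml_type wtype, int n_vocab, int n_embd) {
    return (size_t) n_vocab * ggml_row_size(wtype, n_embd);
}
```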
@@ -155,33 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
  const int n_ctx = hparams.n_ctx;
  const int n_vocab = hparams.n_vocab;

- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ ctx_size += ggml_row_size(GGML_TYPE_F32, n_embd); // ln_f_b

- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // wte
- ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+ ctx_size += n_ctx*ggml_row_size(GGML_TYPE_F32, n_embd); // wpe
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ ctx_size += n_vocab*ggml_row_size(wtype, n_embd); // lm_head

- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_1_b

- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // ln_2_b

- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
+ ctx_size += n_layer*(ggml_row_size(wtype, 3*n_embd*n_embd)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 3*n_embd)); // c_attn_attn_b

- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
+ ctx_size += n_layer*(ggml_row_size(wtype, n_embd*n_embd)); // c_attn_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_attn_proj_b

- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, 4*n_embd)); // c_mlp_fc_b

- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
+ ctx_size += n_layer*(ggml_row_size(wtype, 4*n_embd*n_embd)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ ctx_size += n_layer*(ggml_row_size(GGML_TYPE_F32, n_embd)); // c_mlp_proj_b

- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ ctx_size += n_ctx*n_layer*ggml_row_size(GGML_TYPE_F32, n_embd); // memory_v

  ctx_size += (6 + 12*n_layer)*256; // object overhead

@@ -525,8 +525,7 @@ bool gpt2_eval(
  struct ggml_tensor * KQ_scaled =
  ggml_scale(ctx0,
  KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
+ 1.0f/sqrt(float(n_embd)/n_head));
- );

  // KQ_masked = mask_past(KQ_scaled)
  // [n_past + N, N, 12]
@@ -33,8 +33,13 @@ White's turn

  ## TODO

- - Improve web-browser audio capture - sometimes it does not record the voice properly
- - Add support for more languages by making the generated grammar string multi-lingual
  - Fix bugs in the chess moves logic
+ - Improve web-browser audio capture - sometimes it does not record the voice properly
+ - Add support for more languages by making the generated grammar string multilingual
+ - Explore ways to improve the dynamic grammar to be narrower

  PRs welcome!

+ ## Thanks
+
+ - [chessboardjs](https://chessboardjs.com) for the neat chessboard JS library used in this demo
examples/whisper.swiftui/.gitignore (vendored, new file, 2 lines)
@@ -0,0 +1,2 @@
xcuserdata
xcshareddata
@@ -61,7 +61,9 @@ models = [
  "ggml-small.bin",
  "ggml-medium.en.bin",
  "ggml-medium.bin",
- "ggml-large.bin",
+ "ggml-large-v1.bin",
+ "ggml-large-v2.bin",
+ "ggml-large-v3.bin",
  ]
extra/sync-ggml-am.sh (new executable file, 154 lines)
@@ -0,0 +1,154 @@
#!/bin/bash
#
# Synchronize ggml changes to whisper.cpp
#
# Usage:
#
#   $ cd /path/to/whisper.cpp
#   $ ./extra/sync-ggml-am.sh
#

set -e

sd=$(dirname $0)
cd $sd/../

SRC_WHISPER=$(pwd)
SRC_GGML=$(cd ../ggml; pwd)

if [ ! -d $SRC_GGML ]; then
    echo "ggml not found at $SRC_GGML"
    exit 1
fi

lc=$(cat $SRC_WHISPER/extra/sync-ggml.last)
echo "Syncing ggml changes since commit $lc"

cd $SRC_GGML

git log --oneline $lc..HEAD
git log --oneline $lc..HEAD --reverse | grep -v "(whisper/[0-9]*)" | cut -d' ' -f1 > $SRC_WHISPER/ggml-commits

if [ ! -s $SRC_WHISPER/ggml-commits ]; then
    rm -v $SRC_WHISPER/ggml-commits
    echo "No new commits"
    exit 0
fi

if [ -f $SRC_WHISPER/ggml-src.patch ]; then
    rm -v $SRC_WHISPER/ggml-src.patch
fi

while read c; do
    git format-patch -k $c~1..$c --stdout -- \
        include/ggml/ggml*.h \
        src/ggml*.h \
        src/ggml*.c \
        src/ggml*.cpp \
        src/ggml*.m \
        src/ggml*.metal \
        src/ggml*.cu \
        examples/common.h \
        examples/common.cpp \
        examples/common-ggml.h \
        examples/common-ggml.cpp \
        examples/whisper/whisper.h \
        examples/whisper/whisper.cpp \
        examples/whisper/main.cpp \
        examples/whisper/quantize.cpp \
        >> $SRC_WHISPER/ggml-src.patch
done < $SRC_WHISPER/ggml-commits

rm -v $SRC_WHISPER/ggml-commits

# delete files if empty
if [ ! -s $SRC_WHISPER/ggml-src.patch ]; then
    rm -v $SRC_WHISPER/ggml-src.patch
fi

cd $SRC_WHISPER

if [ -f $SRC_WHISPER/ggml-src.patch ]; then
    # replace PR numbers
    #
    # Subject: some text (#1234)
    # Subject: some text (ggml/1234)
    cat ggml-src.patch | sed -e 's/^Subject: \(.*\) (#\([0-9]*\))/Subject: \1 (ggml\/\2)/' > ggml-src.patch.tmp
    mv ggml-src.patch.tmp ggml-src.patch

    cat ggml-src.patch | sed -e 's/^\(.*\) (#\([0-9]*\))$/\1 (ggml\/\2)/' > ggml-src.patch.tmp
    mv ggml-src.patch.tmp ggml-src.patch

    # replace filenames:
    #
    # src/ggml.c              -> ggml.c
    # src/ggml-alloc.c        -> ggml-alloc.c
    # src/ggml-backend-impl.h -> ggml-backend-impl.h
    # src/ggml-backend.c      -> ggml-backend.c
    # src/ggml-cuda.cu        -> ggml-cuda.cu
    # src/ggml-cuda.h         -> ggml-cuda.h
    # src/ggml-impl.h         -> ggml-impl.h
    # src/ggml-metal.h        -> ggml-metal.h
    # src/ggml-metal.m        -> ggml-metal.m
    # src/ggml-mpi.h          -> ggml-mpi.h
    # src/ggml-mpi.c          -> ggml-mpi.c
    # src/ggml-opencl.cpp     -> ggml-opencl.cpp
    # src/ggml-opencl.h       -> ggml-opencl.h
    # src/ggml-quants.c       -> ggml-quants.c
    # src/ggml-quants.h       -> ggml-quants.h
    # include/ggml/ggml.h         -> ggml.h
    # include/ggml/ggml-alloc.h   -> ggml-alloc.h
    # include/ggml/ggml-backend.h -> ggml-backend.h
    #
    # examples/common.h          -> examples/common.h
    # examples/common.cpp        -> examples/common.cpp
    # examples/common-ggml.h     -> examples/common-ggml.h
    # examples/common-ggml.cpp   -> examples/common-ggml.cpp
    #
    # examples/whisper/whisper.h   -> whisper.h
    # examples/whisper/whisper.cpp -> whisper.cpp
    # examples/whisper/main.cpp    -> examples/main/main.cpp
    # examples/whisper/quantize.cpp -> examples/quantize/quantize.cpp

    cat ggml-src.patch | sed \
        -e 's/src\/ggml\.c/ggml.c/g' \
        -e 's/src\/ggml-alloc\.c/ggml-alloc.c/g' \
        -e 's/src\/ggml-backend-impl\.h/ggml-backend-impl.h/g' \
        -e 's/src\/ggml-backend\.c/ggml-backend.c/g' \
        -e 's/src\/ggml-cuda\.cu/ggml-cuda.cu/g' \
        -e 's/src\/ggml-cuda\.h/ggml-cuda.h/g' \
        -e 's/src\/ggml-impl\.h/ggml-impl.h/g' \
        -e 's/src\/ggml-metal\.h/ggml-metal.h/g' \
        -e 's/src\/ggml-metal\.m/ggml-metal.m/g' \
        -e 's/src\/ggml-mpi\.h/ggml-mpi.h/g' \
        -e 's/src\/ggml-mpi\.c/ggml-mpi.c/g' \
        -e 's/src\/ggml-opencl\.cpp/ggml-opencl.cpp/g' \
        -e 's/src\/ggml-opencl\.h/ggml-opencl.h/g' \
        -e 's/src\/ggml-quants\.c/ggml-quants.c/g' \
        -e 's/src\/ggml-quants\.h/ggml-quants.h/g' \
        -e 's/include\/ggml\/ggml\.h/ggml.h/g' \
        -e 's/include\/ggml\/ggml-alloc\.h/ggml-alloc.h/g' \
        -e 's/include\/ggml\/ggml-backend\.h/ggml-backend.h/g' \
        -e 's/examples\/common\.h/examples\/common.h/g' \
        -e 's/examples\/common\.cpp/examples\/common.cpp/g' \
        -e 's/examples\/common-ggml\.h/examples\/common-ggml.h/g' \
        -e 's/examples\/common-ggml\.cpp/examples\/common-ggml.cpp/g' \
        -e 's/examples\/whisper\/whisper\.h/whisper.h/g' \
        -e 's/examples\/whisper\/whisper\.cpp/whisper.cpp/g' \
        -e 's/examples\/whisper\/main\.cpp/examples\/main\/main.cpp/g' \
        -e 's/examples\/whisper\/quantize\.cpp/examples\/quantize\/quantize.cpp/g' \
        > ggml-src.patch.tmp
    mv ggml-src.patch.tmp ggml-src.patch

    git am ggml-src.patch

    rm -v $SRC_WHISPER/ggml-src.patch
fi

# update last commit
cd $SRC_GGML
git log -1 --format=%H > $SRC_WHISPER/extra/sync-ggml.last

echo "Done"

exit 0
extra/sync-ggml.last (new file, 1 line)
@@ -0,0 +1 @@
3eace58911ea8d2cf35defdc59848d99b91a57f5
extra/sync-llama.sh (new executable file, 5 lines)
@@ -0,0 +1,5 @@
#!/bin/bash

cp -rpv ../llama.cpp/llama.h   ./examples/talk-llama/llama.h
cp -rpv ../llama.cpp/llama.cpp ./examples/talk-llama/llama.cpp
cp -rpv ../llama.cpp/unicode.h ./examples/talk-llama/unicode.h
ggml-alloc.c (18 lines changed)

@@ -72,7 +72,7 @@ static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * t

  // check if a tensor is allocated by this buffer
  static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
- return tensor->buffer == alloc->buffer;
+ return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
  }

  static bool ggml_is_view(struct ggml_tensor * t) {

@@ -449,11 +449,10 @@ static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool upd
  if (update_backend) {
  view->backend = view->view_src->backend;
  }
- view->buffer = view->view_src->buffer;
+ // views are initialized in the alloc buffer rather than the view_src buffer
+ view->buffer = alloc->buffer;
  view->data = (char *)view->view_src->data + view->view_offs;

- // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
- // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
  assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);

  if (!alloc->measure) {

@@ -736,6 +735,10 @@ void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
  }

  void ggml_allocr_free(ggml_allocr_t alloc) {
+ if (alloc == NULL) {
+ return;
+ }
+
  ggml_gallocr_free(alloc->galloc);
  ggml_tallocr_free(alloc->talloc);
  free(alloc);

@@ -775,7 +778,7 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  }

  if (nbytes == 0) {
- fprintf(stderr, "%s: no tensors to allocate\n", __func__);
+ // all the tensors in the context are already allocated
  return NULL;
  }

@@ -789,6 +792,11 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
  } else {
  ggml_backend_view_init(buffer, t);
  }
+ } else {
+ if (t->view_src != NULL) {
+ // view of a pre-allocated tensor
+ ggml_backend_view_init(buffer, t);
+ }
  }
  }
@ -20,6 +20,9 @@ extern "C" {
|
|||||||
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
|
||||||
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
|
||||||
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
|
||||||
|
// check if tensor data is in host memory
|
||||||
|
// should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
|
||||||
|
bool (*is_host) (ggml_backend_buffer_type_t buft);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer_type {
|
struct ggml_backend_buffer_type {
|
||||||
@ -31,15 +34,16 @@ extern "C" {
|
|||||||
typedef void * ggml_backend_buffer_context_t;
|
typedef void * ggml_backend_buffer_context_t;
|
||||||
|
|
||||||
struct ggml_backend_buffer_i {
|
struct ggml_backend_buffer_i {
|
||||||
void (*free_buffer)(ggml_backend_buffer_t buffer);
|
void (*free_buffer) (ggml_backend_buffer_t buffer);
|
||||||
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
//void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras
|
||||||
void * (*get_base) (ggml_backend_buffer_t buffer);
|
void * (*get_base) (ggml_backend_buffer_t buffer);
|
||||||
void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
|
||||||
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
|
||||||
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
|
// (optional) copy tensor between different buffer-type, allow for single-copy tranfers
|
||||||
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
void (*clear) (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
};
|
};
|
||||||
|
|
||||||
struct ggml_backend_buffer {
|
struct ggml_backend_buffer {
|
||||||
@ -78,7 +82,7 @@ extern "C" {
|
|||||||
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
|
||||||
|
|
||||||
void (*synchronize) (ggml_backend_t backend);
|
void (*synchronize)(ggml_backend_t backend);
|
||||||
|
|
||||||
// compute graph with a plan
|
// compute graph with a plan
|
||||||
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
@ -86,7 +90,7 @@ extern "C" {
|
|||||||
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
|
|
||||||
// compute graph without a plan
|
// compute graph without a plan
|
||||||
void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
|
|
||||||
// check if the backend supports an operation
|
// check if the backend supports an operation
|
||||||
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
|
||||||
|
114
ggml-backend.c
114
ggml-backend.c
@ -35,6 +35,13 @@ bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_ba
|
|||||||
return buft->iface.supports_backend(buft, backend);
|
return buft->iface.supports_backend(buft, backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
if (buft->iface.is_host) {
|
||||||
|
return buft->iface.is_host(buft);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// backend buffer
|
// backend buffer
|
||||||
|
|
||||||
ggml_backend_buffer_t ggml_backend_buffer_init(
|
ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||||
@ -94,6 +101,14 @@ size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct g
|
|||||||
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
|
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
buffer->iface.clear(buffer, value);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
|
||||||
|
return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
|
ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
|
||||||
return buffer->buft;
|
return buffer->buft;
|
||||||
}
|
}
|
||||||
@ -180,11 +195,14 @@ void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_
|
|||||||
ggml_backend_synchronize(backend);
|
ggml_backend_synchronize(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
backend->iface.graph_compute(backend, cgraph);
|
if (!backend->iface.graph_compute(backend, cgraph)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: optional sync
|
// TODO: optional sync
|
||||||
ggml_backend_synchronize(backend);
|
ggml_backend_synchronize(backend);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
@ -282,7 +300,7 @@ static void ggml_backend_registry_init(void) {
|
|||||||
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
|
||||||
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
GGML_ASSERT(ggml_backend_registry_count < GGML_MAX_BACKENDS_REG);
|
||||||
|
|
||||||
int id = ggml_backend_registry_count;
|
size_t id = ggml_backend_registry_count;
|
||||||
|
|
||||||
ggml_backend_registry[id] = (struct ggml_backend_reg) {
|
ggml_backend_registry[id] = (struct ggml_backend_reg) {
|
||||||
/* .name = */ {0},
|
/* .name = */ {0},
|
||||||
@ -315,6 +333,8 @@ size_t ggml_backend_reg_find_by_name(const char * name) {
|
|||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// not found
|
||||||
return SIZE_MAX;
|
return SIZE_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -325,15 +345,15 @@ ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str)
|
|||||||
const char * params = strchr(backend_str, ':');
|
const char * params = strchr(backend_str, ':');
|
||||||
char backend_name[128];
|
char backend_name[128];
|
||||||
if (params == NULL) {
|
if (params == NULL) {
|
||||||
strcpy(backend_name, backend_str);
|
snprintf(backend_name, sizeof(backend_name), "%s", backend_str);
|
||||||
params = "";
|
params = "";
|
||||||
} else {
|
} else {
|
||||||
strncpy(backend_name, backend_str, params - backend_str);
|
snprintf(backend_name, sizeof(backend_name), "%.*s", (int)(params - backend_str), backend_str);
|
||||||
backend_name[params - backend_str] = '\0';
|
|
||||||
params++;
|
params++;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
|
size_t backend_i = ggml_backend_reg_find_by_name(backend_name);
|
||||||
|
|
||||||
if (backend_i == SIZE_MAX) {
|
if (backend_i == SIZE_MAX) {
|
||||||
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
|
fprintf(stderr, "%s: backend %s not found\n", __func__, backend_name);
|
||||||
return NULL;
|
return NULL;
|
||||||
@ -378,22 +398,15 @@ static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
|
|||||||
|
|
||||||
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
free(buffer->context);
|
free(buffer->context);
|
||||||
GGML_UNUSED(buffer);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
@ -411,6 +424,10 @@ static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer,
|
|||||||
GGML_UNUSED(buffer);
|
GGML_UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
memset(buffer->context, value, buffer->size);
|
||||||
|
}
|
||||||
|
|
||||||
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
||||||
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
|
||||||
@ -419,6 +436,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
|
|||||||
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
||||||
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
||||||
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
||||||
|
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
||||||
};
|
};
|
||||||
|
|
||||||
// for buffers from ptr, free is not called
|
// for buffers from ptr, free is not called
|
||||||
@ -430,6 +448,7 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
|
|||||||
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
|
||||||
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
/* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
|
||||||
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
/* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
|
||||||
|
/* .clear = */ ggml_backend_cpu_buffer_clear,
|
||||||
};
|
};
|
||||||
|
|
||||||
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
|
||||||
@ -455,20 +474,70 @@ static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_ty
|
|||||||
GGML_UNUSED(buft);
|
GGML_UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
return true;
|
||||||
|
|
||||||
|
GGML_UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
|
||||||
static struct ggml_backend_buffer_type ggml_backend_buffer_type_cpu = {
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
|
||||||
/* .iface = */ {
|
/* .iface = */ {
|
||||||
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
|
||||||
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
},
|
},
|
||||||
/* .context = */ NULL,
|
/* .context = */ NULL,
|
||||||
};
|
};
|
||||||
|
|
||||||
return &ggml_backend_buffer_type_cpu;
|
return &ggml_backend_cpu_buffer_type;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
|
||||||
|
// buffer type HBM
|
||||||
|
|
||||||
|
#include <hbwmalloc.h>
|
||||||
|
|
||||||
|
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
|
hbw_free(buffer->context);
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
|
//void * ptr = hbw_malloc(size);
|
||||||
|
void * ptr;
|
||||||
|
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
|
||||||
|
if (result != 0) {
|
||||||
|
fprintf(stderr, "failed to allocate HBM buffer of size %zu\n", size);
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: this is a hack to avoid having to implement a new buffer type
|
||||||
|
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
|
||||||
|
buffer->buft = buft;
|
||||||
|
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
|
||||||
|
|
||||||
|
return buffer;
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
|
||||||
|
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
|
||||||
|
/* .iface = */ {
|
||||||
|
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
|
||||||
|
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
|
||||||
|
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
|
||||||
|
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
|
||||||
|
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
|
||||||
|
},
|
||||||
|
/* .context = */ NULL,
|
||||||
|
};
|
||||||
|
|
||||||
|
return &ggml_backend_cpu_buffer_type_hbm;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
struct ggml_backend_cpu_context {
|
struct ggml_backend_cpu_context {
|
||||||
int n_threads;
|
int n_threads;
|
||||||
void * work_data;
|
void * work_data;
|
||||||
@ -505,7 +574,7 @@ static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend
|
|||||||
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
|
||||||
|
|
||||||
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||||
cpu_plan->cgraph = *cgraph;
|
cpu_plan->cgraph = *cgraph; // FIXME: deep copy
|
||||||
|
|
||||||
if (cpu_plan->cplan.work_size > 0) {
|
if (cpu_plan->cplan.work_size > 0) {
|
||||||
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
|
||||||
@ -531,7 +600,7 @@ static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_bac
|
|||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
static bool ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
||||||
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
|
||||||
|
|
||||||
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
|
||||||
@ -545,13 +614,18 @@ static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_c
|
|||||||
cplan.work_data = cpu_ctx->work_data;
|
cplan.work_data = cpu_ctx->work_data;
|
||||||
|
|
||||||
ggml_graph_compute(cgraph, &cplan);
|
ggml_graph_compute(cgraph, &cplan);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|
||||||
return true;
|
switch (op->op) {
|
||||||
|
case GGML_OP_MUL_MAT:
|
||||||
|
return op->src[1]->type == GGML_TYPE_F32 || op->src[1]->type == ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
|
||||||
|
default:
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
GGML_UNUSED(op);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_backend_i cpu_backend_i = {
|
static struct ggml_backend_i cpu_backend_i = {
|
||||||
@ -1180,7 +1254,7 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml
|
|||||||
// utils
|
// utils
|
||||||
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
|
||||||
GGML_ASSERT(tensor->buffer == NULL);
|
GGML_ASSERT(tensor->buffer == NULL);
|
||||||
GGML_ASSERT(tensor->data == NULL);
|
//GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
|
||||||
GGML_ASSERT(tensor->view_src != NULL);
|
GGML_ASSERT(tensor->view_src != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
GGML_ASSERT(tensor->view_src->buffer != NULL);
|
||||||
GGML_ASSERT(tensor->view_src->data != NULL);
|
GGML_ASSERT(tensor->view_src->data != NULL);
|
||||||
|
@ -21,6 +21,7 @@ extern "C" {
|
|||||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||||
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
|
||||||
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
|
||||||
|
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
|
||||||
|
|
||||||
// buffer
|
// buffer
|
||||||
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
|
||||||
@ -29,6 +30,8 @@ extern "C" {
|
|||||||
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
|
||||||
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
|
||||||
|
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
|
||||||
|
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer);
|
||||||
|
|
||||||
//
|
//
|
||||||
@ -55,7 +58,7 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
|
||||||
GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
GGML_API bool ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
|
||||||
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
|
||||||
|
|
||||||
// tensor copy between different backends
|
// tensor copy between different backends
|
||||||
@ -76,6 +79,10 @@ extern "C" {
|
|||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void);
|
||||||
|
|
||||||
|
#ifdef GGML_USE_CPU_HBM
|
||||||
|
GGML_API ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void);
|
||||||
|
#endif
|
||||||
|
|
||||||
//
|
//
|
||||||
// Backend registry
|
// Backend registry
|
||||||
//
|
//
|
||||||
|
1394
ggml-cuda.cu
1394
ggml-cuda.cu
File diff suppressed because it is too large
Load Diff
@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
|
|||||||
|
|
||||||
// same as ggml_graph_compute but uses Metal
|
// same as ggml_graph_compute but uses Metal
|
||||||
// creates gf->n_threads command buffers in parallel
|
// creates gf->n_threads command buffers in parallel
|
||||||
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
||||||
|
|
||||||
//
|
//
|
||||||
// backend API
|
// backend API
|
||||||
@ -98,7 +98,10 @@ GGML_API ggml_backend_t ggml_backend_metal_init(void);
|
|||||||
|
|
||||||
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
|
||||||
|
|
||||||
|
GGML_API ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size);
|
||||||
|
|
||||||
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
|
||||||
|
|
||||||
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
GGML_API ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
|
||||||
|
|
||||||
// helper to check if the device supports a specific family
|
// helper to check if the device supports a specific family
|
||||||
|
302
ggml-metal.m
302
ggml-metal.m
@ -87,6 +87,7 @@ struct ggml_metal_context {
|
|||||||
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
|
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
|
||||||
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
||||||
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
||||||
|
GGML_METAL_DECL_KERNEL(get_rows_i32);
|
||||||
GGML_METAL_DECL_KERNEL(rms_norm);
|
GGML_METAL_DECL_KERNEL(rms_norm);
|
||||||
GGML_METAL_DECL_KERNEL(group_norm);
|
GGML_METAL_DECL_KERNEL(group_norm);
|
||||||
GGML_METAL_DECL_KERNEL(norm);
|
GGML_METAL_DECL_KERNEL(norm);
|
||||||
@ -180,7 +181,15 @@ struct ggml_metal_context {
|
|||||||
@implementation GGMLMetalClass
|
@implementation GGMLMetalClass
|
||||||
@end
|
@end
|
||||||
|
|
||||||
ggml_log_callback ggml_metal_log_callback = NULL;
|
|
||||||
|
static void ggml_metal_default_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
|
||||||
|
fprintf(stderr, "%s", msg);
|
||||||
|
|
||||||
|
UNUSED(level);
|
||||||
|
UNUSED(user_data);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_log_callback ggml_metal_log_callback = ggml_metal_default_log_callback;
|
||||||
void * ggml_metal_log_user_data = NULL;
|
void * ggml_metal_log_user_data = NULL;
|
||||||
|
|
||||||
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data) {
|
||||||
@ -251,6 +260,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
NSError * error = nil;
|
NSError * error = nil;
|
||||||
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
|
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
|
||||||
if (libPath != nil) {
|
if (libPath != nil) {
|
||||||
|
// pre-compiled library found
|
||||||
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
||||||
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
||||||
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
||||||
@ -283,6 +293,13 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
options = [MTLCompileOptions new];
|
options = [MTLCompileOptions new];
|
||||||
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
||||||
#endif
|
#endif
|
||||||
|
// try to disable fast-math
|
||||||
|
// NOTE: this seems to have no effect whatsoever
|
||||||
|
// instead, in order to disable fast-math, we have to build default.metallib from the command line
|
||||||
|
// using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
||||||
|
// and go through the "pre-compiled library found" path above
|
||||||
|
//[options setFastMathEnabled:false];
|
||||||
|
|
||||||
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -361,6 +378,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||||||
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
|
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
|
||||||
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
||||||
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
||||||
|
GGML_METAL_ADD_KERNEL(get_rows_i32);
|
||||||
GGML_METAL_ADD_KERNEL(rms_norm);
|
GGML_METAL_ADD_KERNEL(rms_norm);
|
||||||
GGML_METAL_ADD_KERNEL(group_norm);
|
GGML_METAL_ADD_KERNEL(group_norm);
|
||||||
GGML_METAL_ADD_KERNEL(norm);
|
GGML_METAL_ADD_KERNEL(norm);
|
||||||
@ -483,6 +501,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|||||||
GGML_METAL_DEL_KERNEL(get_rows_q4_K);
|
GGML_METAL_DEL_KERNEL(get_rows_q4_K);
|
||||||
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
|
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
|
||||||
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
||||||
|
GGML_METAL_DEL_KERNEL(get_rows_i32);
|
||||||
GGML_METAL_DEL_KERNEL(rms_norm);
|
GGML_METAL_DEL_KERNEL(rms_norm);
|
||||||
GGML_METAL_DEL_KERNEL(group_norm);
|
GGML_METAL_DEL_KERNEL(group_norm);
|
||||||
GGML_METAL_DEL_KERNEL(norm);
|
GGML_METAL_DEL_KERNEL(norm);
|
||||||
@ -607,12 +626,24 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// temporarily defined here for compatibility between ggml-backend and the old API
|
// temporarily defined here for compatibility between ggml-backend and the old API
|
||||||
struct ggml_backend_metal_buffer_context {
|
|
||||||
void * data;
|
struct ggml_backend_metal_buffer {
|
||||||
|
void * data;
|
||||||
|
size_t size;
|
||||||
|
|
||||||
id<MTLBuffer> metal;
|
id<MTLBuffer> metal;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct ggml_backend_metal_buffer_context {
|
||||||
|
void * all_data;
|
||||||
|
size_t all_size;
|
||||||
|
bool owned;
|
||||||
|
|
||||||
|
// multiple buffers are used only to avoid the maximum buffer size limitation when using mmap
|
||||||
|
int n_buffers;
|
||||||
|
struct ggml_backend_metal_buffer buffers[GGML_METAL_MAX_BUFFERS];
|
||||||
|
};
|
||||||
|
|
||||||
// finds the Metal buffer that contains the tensor data on the GPU device
|
// finds the Metal buffer that contains the tensor data on the GPU device
|
||||||
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
// the assumption is that there is 1-to-1 mapping between the host and device memory buffers, so we can find the
|
||||||
// Metal buffer based on the host memory pointer
|
// Metal buffer based on the host memory pointer
|
||||||
@ -622,17 +653,29 @@ static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, stru
|
|||||||
|
|
||||||
const int64_t tsize = ggml_nbytes(t);
|
const int64_t tsize = ggml_nbytes(t);
|
||||||
|
|
||||||
|
ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer;
|
||||||
|
|
||||||
// compatibility with ggml-backend
|
// compatibility with ggml-backend
|
||||||
if (t->buffer && t->buffer->buft == ggml_backend_metal_buffer_type()) {
|
if (buffer && buffer->buft == ggml_backend_metal_buffer_type()) {
|
||||||
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) t->buffer->context;
|
struct ggml_backend_metal_buffer_context * buf_ctx = (struct ggml_backend_metal_buffer_context *) buffer->context;
|
||||||
|
|
||||||
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->data;
|
// find the view that contains the tensor fully
|
||||||
|
for (int i = 0; i < buf_ctx->n_buffers; ++i) {
|
||||||
|
const int64_t ioffs = (int64_t) t->data - (int64_t) buf_ctx->buffers[i].data;
|
||||||
|
|
||||||
GGML_ASSERT(ioffs >= 0 && ioffs + tsize <= (int64_t) t->buffer->size);
|
//GGML_METAL_LOG_INFO("ioffs = %10ld, tsize = %10ld, sum = %10ld, buf_ctx->buffers[%d].size = %10ld\n", ioffs, tsize, ioffs + tsize, i, buf_ctx->buffers[i].size);
|
||||||
|
if (ioffs >= 0 && ioffs + tsize <= (int64_t) buf_ctx->buffers[i].size) {
|
||||||
|
*offs = (size_t) ioffs;
|
||||||
|
|
||||||
*offs = (size_t) ioffs;
|
//GGML_METAL_LOG_INFO("%s: tensor '%16s', offs = %8ld\n", __func__, t->name, *offs);
|
||||||
|
|
||||||
return buf_ctx->metal;
|
return buf_ctx->buffers[i].metal;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: tensor '%s' buffer is nil\n", __func__, t->name);
|
||||||
|
|
||||||
|
return nil;
|
||||||
}
|
}
|
||||||
|
|
||||||
// find the view that contains the tensor fully
|
// find the view that contains the tensor fully
|
||||||
@ -934,7 +977,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
|
|||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
void ggml_metal_graph_compute(
|
bool ggml_metal_graph_compute(
|
||||||
struct ggml_metal_context * ctx,
|
struct ggml_metal_context * ctx,
|
||||||
struct ggml_cgraph * gf) {
|
struct ggml_cgraph * gf) {
|
||||||
@autoreleasepool {
|
@autoreleasepool {
|
||||||
@ -1198,7 +1241,7 @@ void ggml_metal_graph_compute(
|
|||||||
// not sure how to avoid this
|
// not sure how to avoid this
|
||||||
// TODO: make a simpler cpy_bytes kernel
|
// TODO: make a simpler cpy_bytes kernel
|
||||||
|
|
||||||
const int nth = MIN(1024, ne00);
|
const int nth = MIN((int) ctx->pipeline_cpy_f32_f32.maxTotalThreadsPerThreadgroup, ne00);
|
||||||
|
|
||||||
[encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
|
[encoder setComputePipelineState:ctx->pipeline_cpy_f32_f32];
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
@ -1253,7 +1296,7 @@ void ggml_metal_graph_compute(
|
|||||||
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
|
[encoder setBytes:&pnb3 length:sizeof(pnb3) atIndex:26];
|
||||||
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];
|
[encoder setBytes:&offs length:sizeof(offs) atIndex:27];
|
||||||
|
|
||||||
const int nth = MIN(1024, ne0);
|
const int nth = MIN((int) ctx->pipeline_add.maxTotalThreadsPerThreadgroup, ne00);
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne11, ne12, ne13) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
@ -1261,7 +1304,7 @@ void ggml_metal_graph_compute(
|
|||||||
{
|
{
|
||||||
GGML_ASSERT(ggml_is_contiguous(src0));
|
GGML_ASSERT(ggml_is_contiguous(src0));
|
||||||
|
|
||||||
const float scale = *(const float *) src1->data;
|
const float scale = *(const float *) dst->op_params;
|
||||||
|
|
||||||
int64_t n = ggml_nelements(dst);
|
int64_t n = ggml_nelements(dst);
|
||||||
|
|
||||||
@ -1272,8 +1315,8 @@ void ggml_metal_graph_compute(
|
|||||||
[encoder setComputePipelineState:ctx->pipeline_scale];
|
[encoder setComputePipelineState:ctx->pipeline_scale];
|
||||||
}
|
}
|
||||||
|
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
||||||
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
[encoder setBytes:&scale length:sizeof(scale) atIndex:2];
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
|
||||||
@ -1617,6 +1660,10 @@ void ggml_metal_graph_compute(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (ggml_is_quantized(src0t)) {
|
||||||
|
GGML_ASSERT(ne00 >= nth0*nth1);
|
||||||
|
}
|
||||||
|
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||||
@ -1675,6 +1722,9 @@ void ggml_metal_graph_compute(
|
|||||||
// TODO: make this more general
|
// TODO: make this more general
|
||||||
GGML_ASSERT(n_as <= 8);
|
GGML_ASSERT(n_as <= 8);
|
||||||
|
|
||||||
|
// max size of the src1ids array in the kernel stack
|
||||||
|
GGML_ASSERT(ne11 <= 512);
|
||||||
|
|
||||||
struct ggml_tensor * src2 = gf->nodes[i]->src[2];
|
struct ggml_tensor * src2 = gf->nodes[i]->src[2];
|
||||||
|
|
||||||
const int64_t ne20 = src2 ? src2->ne[0] : 0;
|
const int64_t ne20 = src2 ? src2->ne[0] : 0;
|
||||||
@ -1692,9 +1742,6 @@ void ggml_metal_graph_compute(
|
|||||||
GGML_ASSERT(!ggml_is_transposed(src2));
|
GGML_ASSERT(!ggml_is_transposed(src2));
|
||||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||||
|
|
||||||
GGML_ASSERT(ne20 % 32 == 0);
|
|
||||||
// !!!!!!!!! TODO: this assert is probably required but not sure!
|
|
||||||
//GGML_ASSERT(ne20 >= 64);
|
|
||||||
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
GGML_ASSERT(src1t == GGML_TYPE_F32);
|
||||||
|
|
||||||
const uint r2 = ne12/ne22;
|
const uint r2 = ne12/ne22;
|
||||||
@ -1702,22 +1749,22 @@ void ggml_metal_graph_compute(
|
|||||||
|
|
||||||
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
// find the break-even point where the matrix-matrix kernel becomes more efficient compared
|
||||||
// to the matrix-vector kernel
|
// to the matrix-vector kernel
|
||||||
int ne11_mm_min = 1;
|
int ne11_mm_min = n_as;
|
||||||
|
|
||||||
const int idx = ((int32_t *) dst->op_params)[0];
|
const int idx = ((int32_t *) dst->op_params)[0];
|
||||||
|
|
||||||
// batch size
|
// batch size
|
||||||
GGML_ASSERT(ne01 == ne11);
|
GGML_ASSERT(ne01 == ne11);
|
||||||
|
|
||||||
const int64_t _ne1 = 1; // kernel_mul_mm_impl needs a reference in constant memory
|
|
||||||
|
|
||||||
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
// for now the matrix-matrix multiplication kernel only works on A14+/M1+ SoCs
|
||||||
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
// AMD GPU and older A-chips will reuse matrix-vector multiplication kernel
|
||||||
// !!!
|
// !!!
|
||||||
// TODO: for now, always use mat-vec kernels until we figure out how to improve the
|
// TODO: for now, always use mat-vec kernels until we figure out how to improve the
|
||||||
// indirect matrix multiplication
|
// indirect matrix multiplication
|
||||||
// !!!
|
// !!!
|
||||||
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] && _ne1 > ne11_mm_min) {
|
if ([ctx->device supportsFamily:MTLGPUFamilyApple7] &&
|
||||||
|
ne20 % 32 == 0 && ne20 >= 64 &&
|
||||||
|
ne11 > ne11_mm_min) {
|
||||||
switch (src2->type) {
|
switch (src2->type) {
|
||||||
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
|
case GGML_TYPE_F32: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f32_f32]; break;
|
||||||
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
|
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_f16_f32]; break;
|
||||||
@ -1747,14 +1794,15 @@ void ggml_metal_graph_compute(
|
|||||||
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
|
[encoder setBytes:&nb11 length:sizeof(nb11) atIndex:11];
|
||||||
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
|
[encoder setBytes:&nb12 length:sizeof(nb12) atIndex:12];
|
||||||
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
[encoder setBytes:&ne0 length:sizeof(ne0) atIndex:13];
|
||||||
[encoder setBytes:&_ne1 length:sizeof(_ne1) atIndex:14];
|
[encoder setBytes:&ne1 length:sizeof(ne1) atIndex:14];
|
||||||
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
[encoder setBytes:&nb1 length:sizeof(nb1) atIndex:15];
|
||||||
[encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
|
[encoder setBytes:&r2 length:sizeof(r2) atIndex:16];
|
||||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
|
[encoder setBytes:&r3 length:sizeof(r3) atIndex:17];
|
||||||
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
|
[encoder setBytes:&idx length:sizeof(idx) atIndex:18];
|
||||||
// TODO: how to make this an array? read Metal docs
|
// TODO: how to make this an array? read Metal docs
|
||||||
for (int j = 0; j < n_as; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
struct ggml_tensor * src_cur = dst->src[2 + j];
|
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
|
||||||
|
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
||||||
|
|
||||||
size_t offs_src_cur = 0;
|
size_t offs_src_cur = 0;
|
||||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
||||||
@ -1764,8 +1812,7 @@ void ggml_metal_graph_compute(
|
|||||||
|
|
||||||
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
[encoder setThreadgroupMemoryLength:8192 atIndex:0];
|
||||||
|
|
||||||
// TODO: processing one row at a time (ne11 -> 1) is not efficient
|
[encoder dispatchThreadgroups:MTLSizeMake((ne11 + 31)/32, (ne21 + 63)/64, n_as*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake( (_ne1 + 31)/32, (ne21 + 63)/64, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(128, 1, 1)];
|
|
||||||
} else {
|
} else {
|
||||||
int nth0 = 32;
|
int nth0 = 32;
|
||||||
int nth1 = 1;
|
int nth1 = 1;
|
||||||
@ -1848,11 +1895,17 @@ void ggml_metal_graph_compute(
|
|||||||
} break;
|
} break;
|
||||||
default:
|
default:
|
||||||
{
|
{
|
||||||
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
||||||
GGML_ASSERT(false && "not implemented");
|
GGML_ASSERT(false && "not implemented");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (ggml_is_quantized(src2t)) {
|
||||||
|
GGML_ASSERT(ne20 >= nth0*nth1);
|
||||||
|
}
|
||||||
|
|
||||||
|
const int64_t _ne1 = 1; // kernels needs a reference in constant memory
|
||||||
|
|
||||||
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
||||||
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
|
||||||
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
|
||||||
@ -1877,8 +1930,9 @@ void ggml_metal_graph_compute(
|
|||||||
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
|
[encoder setBytes:&r3 length:sizeof(r3) atIndex:21];
|
||||||
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
|
[encoder setBytes:&idx length:sizeof(idx) atIndex:22];
|
||||||
// TODO: how to make this an array? read Metal docs
|
// TODO: how to make this an array? read Metal docs
|
||||||
for (int j = 0; j < n_as; ++j) {
|
for (int j = 0; j < 8; ++j) {
|
||||||
struct ggml_tensor * src_cur = dst->src[2 + j];
|
// NOTE: this is done like this to avoid uninitialized kernel arguments when n_as < 8
|
||||||
|
struct ggml_tensor * src_cur = dst->src[2 + (j % n_as)];
|
||||||
|
|
||||||
size_t offs_src_cur = 0;
|
size_t offs_src_cur = 0;
|
||||||
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
id<MTLBuffer> id_src_cur = ggml_metal_get_buffer(ctx, src_cur, &offs_src_cur);
|
||||||
@ -1927,6 +1981,7 @@ void ggml_metal_graph_compute(
|
|||||||
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
|
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
|
||||||
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
||||||
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
||||||
|
case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
|
||||||
default: GGML_ASSERT(false && "not implemented");
|
default: GGML_ASSERT(false && "not implemented");
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2197,7 +2252,7 @@ void ggml_metal_graph_compute(
|
|||||||
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
[encoder setBytes:&nb3 length:sizeof(nb3) atIndex:17];
|
||||||
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
|
[encoder setBytes:&sf length:sizeof(sf) atIndex:18];
|
||||||
|
|
||||||
const int nth = MIN(1024, ne0);
|
const int nth = MIN((int) ctx->pipeline_upscale_f32.maxTotalThreadsPerThreadgroup, ne0);
|
||||||
|
|
||||||
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
[encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
||||||
} break;
|
} break;
|
||||||
@ -2350,10 +2405,11 @@ void ggml_metal_graph_compute(
|
|||||||
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
||||||
if (status != MTLCommandBufferStatusCompleted) {
|
if (status != MTLCommandBufferStatusCompleted) {
|
||||||
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
||||||
GGML_ASSERT(false);
|
return false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -2361,6 +2417,7 @@ void ggml_metal_graph_compute(
|
|||||||
|
|
||||||
// backend interface
|
// backend interface
|
||||||
|
|
||||||
|
// default buffer
|
||||||
static id<MTLDevice> g_backend_device = nil;
|
static id<MTLDevice> g_backend_device = nil;
|
||||||
static int g_backend_device_ref_count = 0;
|
static int g_backend_device_ref_count = 0;
|
||||||
|
|
||||||
@ -2388,34 +2445,31 @@ static void ggml_backend_metal_free_device(void) {
|
|||||||
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
return ctx->data;
|
return ctx->all_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
[ctx->metal release];
|
for (int i = 0; i < ctx->n_buffers; i++) {
|
||||||
|
[ctx->buffers[i].metal release];
|
||||||
|
}
|
||||||
ggml_backend_metal_free_device();
|
ggml_backend_metal_free_device();
|
||||||
|
|
||||||
free(ctx->data);
|
if (ctx->owned) {
|
||||||
free(ctx);
|
free(ctx->all_data);
|
||||||
|
}
|
||||||
|
|
||||||
UNUSED(buffer);
|
free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy((char *)tensor->data + offset, data, size);
|
memcpy((char *)tensor->data + offset, data, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
|
||||||
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
|
|
||||||
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
|
|
||||||
|
|
||||||
memcpy(data, (const char *)tensor->data + offset, size);
|
memcpy(data, (const char *)tensor->data + offset, size);
|
||||||
|
|
||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
@ -2433,7 +2487,13 @@ static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer
|
|||||||
UNUSED(buffer);
|
UNUSED(buffer);
|
||||||
}
|
}
|
||||||
|
|
||||||
static struct ggml_backend_buffer_i metal_backend_buffer_i = {
|
static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
|
||||||
|
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
|
||||||
|
|
||||||
|
memset(ctx->all_data, value, ctx->all_size);
|
||||||
|
}
|
||||||
|
|
||||||
|
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
|
||||||
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
|
||||||
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
/* .get_base = */ ggml_backend_metal_buffer_get_base,
|
||||||
/* .init_tensor = */ NULL,
|
/* .init_tensor = */ NULL,
|
||||||
@ -2441,8 +2501,11 @@ static struct ggml_backend_buffer_i metal_backend_buffer_i = {
|
|||||||
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
|
/* .get_tensor = */ ggml_backend_metal_buffer_get_tensor,
|
||||||
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
|
/* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from,
|
||||||
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
/* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to,
|
||||||
|
/* .clear = */ ggml_backend_metal_buffer_clear,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// default buffer type
|
||||||
|
|
||||||
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||||
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
|
||||||
|
|
||||||
@ -2453,13 +2516,46 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba
|
|||||||
size_aligned += (size_page - (size_aligned % size_page));
|
size_aligned += (size_page - (size_aligned % size_page));
|
||||||
}
|
}
|
||||||
|
|
||||||
ctx->data = ggml_metal_host_malloc(size);
|
id<MTLDevice> device = ggml_backend_metal_get_device();
|
||||||
ctx->metal = [ggml_backend_metal_get_device() newBufferWithBytesNoCopy:ctx->data
|
|
||||||
|
ctx->all_data = ggml_metal_host_malloc(size_aligned);
|
||||||
|
ctx->all_size = size_aligned;
|
||||||
|
ctx->owned = true;
|
||||||
|
ctx->n_buffers = 1;
|
||||||
|
|
||||||
|
ctx->buffers[0].data = ctx->all_data;
|
||||||
|
ctx->buffers[0].size = size;
|
||||||
|
ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
|
||||||
length:size_aligned
|
length:size_aligned
|
||||||
options:MTLResourceStorageModeShared
|
options:MTLResourceStorageModeShared
|
||||||
deallocator:nil];
|
deallocator:nil];
|
||||||
|
|
||||||
return ggml_backend_buffer_init(buft, metal_backend_buffer_i, ctx, size);
|
if (ctx->buffers[0].metal == nil) {
|
||||||
|
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
||||||
|
free(ctx);
|
||||||
|
ggml_backend_metal_free_device();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
|
||||||
|
#if TARGET_OS_OSX
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
|
||||||
|
device.currentAllocatedSize / 1024.0 / 1024.0,
|
||||||
|
device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
||||||
|
|
||||||
|
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
||||||
|
GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
||||||
|
} else {
|
||||||
|
GGML_METAL_LOG_INFO("\n");
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
return ggml_backend_buffer_init(buft, ggml_backend_metal_buffer_i, ctx, size);
|
||||||
}
|
}
|
||||||
|
|
||||||
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
|
||||||
@ -2470,7 +2566,13 @@ static size_t ggml_backend_metal_buffer_type_get_alignment(ggml_backend_buffer_t
|
|||||||
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
|
||||||
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
|
||||||
|
|
||||||
GGML_UNUSED(buft);
|
UNUSED(buft);
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
|
||||||
|
return true;
|
||||||
|
|
||||||
|
UNUSED(buft);
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
|
||||||
@@ -2480,6 +2582,7 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
             /* .get_alignment    = */ ggml_backend_metal_buffer_type_get_alignment,
             /* .get_alloc_size   = */ NULL, // defaults to ggml_nbytes
             /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
+            /* .is_host          = */ ggml_backend_metal_buffer_type_is_host,
         },
         /* .context = */ NULL,
     };
@@ -2487,6 +2590,87 @@ ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) {
     return &ggml_backend_buffer_type_metal;
 }

+// buffer from ptr
+
+ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
+    struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context));
+
+    ctx->all_data = data;
+    ctx->all_size = size;
+    ctx->owned = false;
+    ctx->n_buffers = 0;
+
+    const size_t size_page = sysconf(_SC_PAGESIZE);
+    size_t size_aligned = size;
+    if ((size_aligned % size_page) != 0) {
+        size_aligned += (size_page - (size_aligned % size_page));
+    }
+
+    id<MTLDevice> device = ggml_backend_metal_get_device();
+
+    // the buffer fits into the max buffer size allowed by the device
+    if (size_aligned <= device.maxBufferLength) {
+        ctx->buffers[ctx->n_buffers].data = data;
+        ctx->buffers[ctx->n_buffers].size = size;
+
+        ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+        if (ctx->buffers[ctx->n_buffers].metal == nil) {
+            GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+            return false;
+        }
+
+        GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB", __func__, size_aligned / 1024.0 / 1024.0);
+
+        ++ctx->n_buffers;
+    } else {
+        // this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
+        // one of the views
+        const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
+        const size_t size_step = device.maxBufferLength - size_ovlp;
+        const size_t size_view = device.maxBufferLength;
+
+        for (size_t i = 0; i < size; i += size_step) {
+            const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
+
+            ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+            ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+
+            ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+
+            if (ctx->buffers[ctx->n_buffers].metal == nil) {
+                GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+                return false;
+            }
+
+            GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, offs = %12ld", __func__, size_step_aligned / 1024.0 / 1024.0, i);
+            if (i + size_step < size) {
+                GGML_METAL_LOG_INFO("\n");
+            }
+
+            ++ctx->n_buffers;
+        }
+    }
+
+#if TARGET_OS_OSX
+    GGML_METAL_LOG_INFO(", (%8.2f / %8.2f)",
+            device.currentAllocatedSize / 1024.0 / 1024.0,
+            device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
+
+    if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
+        GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
+    } else {
+        GGML_METAL_LOG_INFO("\n");
+    }
+#else
+    GGML_METAL_LOG_INFO(", (%8.2f)\n", device.currentAllocatedSize / 1024.0 / 1024.0);
+#endif
+
+    return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+}
+
+// backend
+
 static const char * ggml_backend_metal_name(ggml_backend_t backend) {
     return "Metal";

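The overlapping-view scheme in ggml_backend_metal_buffer_from_ptr can be pictured with a small standalone sketch. plan_views and its parameters are hypothetical names used only for illustration; the stepping follows the diff: views are at most max_buf bytes long and consecutive views overlap by roughly max_size (rounded up to pages), so the largest tensor always falls entirely inside at least one view.

    #include <stdio.h>
    #include <stddef.h>

    // Sketch of the view layout for a host buffer larger than the device's
    // maxBufferLength. Illustrative only, not ggml API.
    static void plan_views(size_t size, size_t max_size, size_t max_buf, size_t page) {
        const size_t size_ovlp = ((max_size + page - 1) / page + 1) * page; // overlap, rounded up with one page of slack
        const size_t size_step = max_buf - size_ovlp;                       // distance between view starts
        for (size_t i = 0; i < size; i += size_step) {
            const size_t len = (i + max_buf <= size) ? max_buf : (size - i);
            printf("view: offset %zu, length %zu\n", i, len);
        }
    }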
@@ -2499,20 +2683,16 @@ static void ggml_backend_metal_free(ggml_backend_t backend) {
     free(backend);
 }

-static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
-    UNUSED(backend);
-}
-
 static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
     return ggml_backend_metal_buffer_type();

     UNUSED(backend);
 }

-static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
     struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;

-    ggml_metal_graph_compute(metal_ctx, cgraph);
+    return ggml_metal_graph_compute(metal_ctx, cgraph);
 }

 static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
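Since graph_compute now reports success, a caller can react to failures instead of silently continuing. A hedged usage sketch (metal_ctx and gf are assumed to exist in the caller; the error handling is illustrative, not taken from the repository):

    if (!ggml_metal_graph_compute(metal_ctx, gf)) {
        fprintf(stderr, "metal graph compute failed\n");
        // fall back to the CPU backend or abort this decode step
    }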
@@ -2529,25 +2709,15 @@ static struct ggml_backend_i metal_backend_i = {
     /* .get_tensor_async        = */ NULL,
     /* .cpy_tensor_from_async   = */ NULL,
     /* .cpy_tensor_to_async     = */ NULL,
-    /* .synchronize             = */ ggml_backend_metal_synchronize,
-    /* .graph_plan_create       = */ NULL, // the metal implementation does not require creating graph plans atm
+    /* .synchronize             = */ NULL,
+    /* .graph_plan_create       = */ NULL,
     /* .graph_plan_free         = */ NULL,
     /* .graph_plan_compute      = */ NULL,
     /* .graph_compute           = */ ggml_backend_metal_graph_compute,
     /* .supports_op             = */ ggml_backend_metal_supports_op,
 };

-// TODO: make a common log callback for all backends in ggml-backend
-static void ggml_backend_log_callback(enum ggml_log_level level, const char * msg, void * user_data) {
-    fprintf(stderr, "%s", msg);
-
-    UNUSED(level);
-    UNUSED(user_data);
-}
-
 ggml_backend_t ggml_backend_metal_init(void) {
-    ggml_metal_log_set_callback(ggml_backend_log_callback, NULL);
-
     struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);

     if (ctx == NULL) {
ggml-metal.metal: 722 lines changed (file diff suppressed because it is too large)
@@ -6,19 +6,19 @@
 extern "C" {
 #endif

-void ggml_cl_init(void);
+GGML_API void ggml_cl_init(void);

-void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
-void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
+GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
+GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);

-void * ggml_cl_host_malloc(size_t size);
-void ggml_cl_host_free(void * ptr);
+GGML_API void * ggml_cl_host_malloc(size_t size);
+GGML_API void ggml_cl_host_free(void * ptr);

-void ggml_cl_free_data(const struct ggml_tensor* tensor);
+GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor);

-void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);
+GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor);

 #ifdef __cplusplus
 }
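GGML_API is the usual export/visibility annotation for the public C symbols. Its exact definition lives in ggml.h; a simplified sketch of what such a macro typically expands to (not copied verbatim from the header):

    #ifdef GGML_SHARED
    #    if defined(_WIN32) && !defined(__MINGW32__)
    #        ifdef GGML_BUILD
    #            define GGML_API __declspec(dllexport)
    #        else
    #            define GGML_API __declspec(dllimport)
    #        endif
    #    else
    #        define GGML_API __attribute__ ((visibility ("default")))
    #    endif
    #else
    #    define GGML_API
    #endif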
ggml-quants.c: 481 lines changed
@@ -407,6 +407,22 @@ inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
 #define ggml_vld1q_s8_x4 vld1q_s8_x4

 #endif

+#if !defined(__ARM_FEATURE_DOTPROD)
+
+inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
+    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
+    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
+
+    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
+}
+
+#else
+
+#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
+
+#endif
+
 #endif

 #if defined(__ARM_NEON) || defined(__wasm_simd128__)
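The new ggml_vdotq_s32 wrapper lets all of the NEON kernels below call a single dot-product primitive: on CPUs with the dot-product extension it maps straight to vdotq_s32, otherwise it falls back to the vmull_s8/vpaddlq_s16 sequence shown above. A scalar reference of the quantity the kernels actually consume (they reduce the four int32 lanes with vaddvq_s32) is sketched below; the helper name is hypothetical and only for illustration:

    #include <stdint.h>

    // Reference: sum of the 16 int8 products that one ggml_vdotq_s32 call folds
    // into its four int32 lanes (the kernels then reduce those lanes horizontally).
    static int32_t dot16_i8_ref(const int8_t a[16], const int8_t b[16]) {
        int32_t sum = 0;
        for (int k = 0; k < 16; ++k) {
            sum += (int32_t) a[k] * (int32_t) b[k];
        }
        return sum;
    }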
@@ -2468,32 +2484,12 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx,
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0l), v0_0hs, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1l), v0_1hs, v1_1h);

         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#endif
     }

     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -2776,32 +2772,12 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);
+        const int32x4_t p_0 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_0l, v1_0l), v0_0h, v1_0h);
+        const int32x4_t p_1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), v0_1l, v1_1l), v0_1h, v1_1h);

         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), GGML_FP16_TO_FP32(x0->d)*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), GGML_FP16_TO_FP32(x1->d)*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0l), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0l), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0h), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0h), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1l), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1l), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1h), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1h), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
-#endif
     }

     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs;
@@ -2963,32 +2939,12 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#endif
     }

     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
@@ -3275,32 +3231,12 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
-                        vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0lf, v1_0l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_0hf, v1_0h))), GGML_FP16_TO_FP32(x0->d)*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
-                        vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1lf, v1_1l),
+                        ggml_vdotq_s32(vdupq_n_s32(0), v0_1hf, v1_1h))), GGML_FP16_TO_FP32(x1->d)*y1->d);
-#else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lf), vget_low_s8 (v1_0l));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lf), vget_high_s8(v1_0l));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hf), vget_low_s8 (v1_0h));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hf), vget_high_s8(v1_0h));
-
-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lf), vget_low_s8 (v1_1l));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lf), vget_high_s8(v1_1l));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hf), vget_low_s8 (v1_1h));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hf), vget_high_s8(v1_1h));
-
-        const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
-        const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
-        const int32x4_t pl1 = vaddq_s32(vpaddlq_s16(pl1l), vpaddlq_s16(pl1h));
-        const int32x4_t ph1 = vaddq_s32(vpaddlq_s16(ph1l), vpaddlq_s16(ph1h));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(pl0, ph0)), GGML_FP16_TO_FP32(x0->d)*y0->d);
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(pl1, ph1)), GGML_FP16_TO_FP32(x1->d)*y1->d);
-#endif
     }

     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1) + summs0 + summs1;
@@ -3550,34 +3486,13 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri
         const int8x16_t y1_0 = vld1q_s8(y1->qs);
         const int8x16_t y1_1 = vld1q_s8(y1->qs + 16);

-#if defined(__ARM_FEATURE_DOTPROD)
         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
-                        vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_0, y0_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x0_1, y0_1))), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));

         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(
-                        vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
-                        vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_0, y1_0),
+                        ggml_vdotq_s32(vdupq_n_s32(0), x1_1, y1_1))), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));

-#else
-        const int16x8_t p0_0 = vmull_s8(vget_low_s8 (x0_0), vget_low_s8 (y0_0));
-        const int16x8_t p0_1 = vmull_s8(vget_high_s8(x0_0), vget_high_s8(y0_0));
-        const int16x8_t p0_2 = vmull_s8(vget_low_s8 (x0_1), vget_low_s8 (y0_1));
-        const int16x8_t p0_3 = vmull_s8(vget_high_s8(x0_1), vget_high_s8(y0_1));
-
-        const int16x8_t p1_0 = vmull_s8(vget_low_s8 (x1_0), vget_low_s8 (y1_0));
-        const int16x8_t p1_1 = vmull_s8(vget_high_s8(x1_0), vget_high_s8(y1_0));
-        const int16x8_t p1_2 = vmull_s8(vget_low_s8 (x1_1), vget_low_s8 (y1_1));
-        const int16x8_t p1_3 = vmull_s8(vget_high_s8(x1_1), vget_high_s8(y1_1));
-
-        const int32x4_t p0 = vaddq_s32(vpaddlq_s16(p0_0), vpaddlq_s16(p0_1));
-        const int32x4_t p1 = vaddq_s32(vpaddlq_s16(p0_2), vpaddlq_s16(p0_3));
-        const int32x4_t p2 = vaddq_s32(vpaddlq_s16(p1_0), vpaddlq_s16(p1_1));
-        const int32x4_t p3 = vaddq_s32(vpaddlq_s16(p1_2), vpaddlq_s16(p1_3));
-
-        sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vaddq_s32(p0, p1)), GGML_FP16_TO_FP32(x0->d)*GGML_FP16_TO_FP32(y0->d));
-        sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vaddq_s32(p2, p3)), GGML_FP16_TO_FP32(x1->d)*GGML_FP16_TO_FP32(y1->d));
-#endif
     }

     *s = vaddvq_f32(sumv0) + vaddvq_f32(sumv1);
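For orientation, the q8_0 x q8_0 kernel is a per-block scaled integer dot product; the NEON code above evaluates it 16 lanes at a time. A scalar sketch of one block, assuming the usual q8_0 layout of 32 int8 quants plus one fp16 scale per block:

    // Scalar reference for one q8_0 block pair (illustrative only, assumed layout).
    static float q8_0_block_dot_ref(const int8_t * xq, float xd, const int8_t * yq, float yd) {
        int32_t sumi = 0;
        for (int k = 0; k < 32; ++k) {
            sumi += (int32_t) xq[k] * (int32_t) yq[k];
        }
        return xd * yd * (float) sumi;
    }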
@@ -3650,12 +3565,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const int nb = n / QK_K;

 #ifdef __ARM_NEON

     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
-#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
-#endif

     ggml_int8x16x2_t q2bytes;
     uint8_t aux[16];
@@ -3663,7 +3576,6 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;

     for (int i = 0; i < nb; ++i) {
-
         const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
         const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin);

@@ -3677,7 +3589,7 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

         const uint8x16_t mins = vshrq_n_u8(mins_and_scales, 4);
         const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
-        const ggml_int16x8x2_t mins16 = {vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))};
+        const ggml_int16x8x2_t mins16 = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mins))), vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mins)))}};
         const int32x4_t s0 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[0]), vget_low_s16 (q8sums.val[0])),
                                        vmull_s16(vget_high_s16(mins16.val[0]), vget_high_s16(q8sums.val[0])));
         const int32x4_t s1 = vaddq_s32(vmull_s16(vget_low_s16 (mins16.val[1]), vget_low_s16 (q8sums.val[1])),
@@ -3689,20 +3601,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri

         // We use this macro instead of a function call because for some reason
         // the code runs 2-3% slower, even if the function is declared inline
-#if defined(__ARM_FEATURE_DOTPROD)
 #define MULTIPLY_ACCUM_WITH_SCALE(index)\
-        isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
-        isum += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];
-#else
-#define MULTIPLY_ACCUM_WITH_SCALE(index)\
-        {\
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));\
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));\
-        isum += vaddvq_s16(p1) * aux[is+(index)] + vaddvq_s16(p2) * aux[is+1+(index)];\
-        }
-#endif
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * aux[is+(index)];\
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * aux[is+1+(index)];

 #define SHIFT_MULTIPLY_ACCUM_WITH_SCALE(shift, index)\
         q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;\
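MULTIPLY_ACCUM_WITH_SCALE stays a macro because, per the comment above, the function form measured 2-3% slower. For readability, one term of its body is equivalent to the inline helper below, a sketch that assumes ARM NEON and the ggml_vdotq_s32 helper introduced earlier in this diff:

    #include <arm_neon.h>

    // Equivalent of one MULTIPLY_ACCUM_WITH_SCALE term: dot two 16-byte vectors
    // and weight the result by a per-block scale. Illustrative only.
    static inline int32_t mul_accum_with_scale(int8x16_t q2, int8x16_t q8, uint8_t scale) {
        const int32x4_t vzero = vdupq_n_s32(0);
        return vaddvq_s32(ggml_vdotq_s32(vzero, q2, q8)) * (int32_t) scale;
    }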
@@ -3710,26 +3611,23 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits.val[1], (shift)), m3));\
         MULTIPLY_ACCUM_WITH_SCALE((index));

         for (int j = 0; j < QK_K/128; ++j) {

             const ggml_uint8x16x2_t q2bits = ggml_vld1q_u8_x2(q2); q2 += 32;

             ggml_int8x16x2_t q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
             q2bytes.val[0] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[0], m3));
             q2bytes.val[1] = vreinterpretq_s8_u8(vandq_u8(q2bits.val[1], m3));

             MULTIPLY_ACCUM_WITH_SCALE(0);

             SHIFT_MULTIPLY_ACCUM_WITH_SCALE(2, 2);

             SHIFT_MULTIPLY_ACCUM_WITH_SCALE(4, 4);

             SHIFT_MULTIPLY_ACCUM_WITH_SCALE(6, 6);

             is += 8;
         }
-        sum += d * isum;

+        sum += d * isum;
     }

     *s = sum;
@@ -4043,11 +3941,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
     const int nb = n / QK_K;

 #ifdef __ARM_NEON

     const uint8x16_t m3 = vdupq_n_u8(0x3);
-#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
-#endif

     ggml_int8x16x4_t q2bytes;

@@ -4081,28 +3977,12 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
         q2bytes.val[2] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 4), m3));
         q2bytes.val[3] = vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q2bits, 6), m3));

-#if defined(__ARM_FEATURE_DOTPROD)
-        isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
-        isum1 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
-        isum2 += vaddvq_s32(vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
-#else
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q2bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q2bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        isum1 += vaddvq_s16(p1) * scales[0];
-        isum2 += vaddvq_s16(p2) * scales[1];
-
-        const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[2]), vget_low_s8 (q8bytes.val[2])), vmull_s8(vget_high_s8(q2bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        const int16x8_t p4 = vaddq_s16(vmull_s8(vget_low_s8 (q2bytes.val[3]), vget_low_s8 (q8bytes.val[3])), vmull_s8(vget_high_s8(q2bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        isum1 += vaddvq_s16(p3) * scales[2];
-        isum2 += vaddvq_s16(p4) * scales[3];
-#endif
+        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[0], q8bytes.val[0])) * scales[0];
+        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[1], q8bytes.val[1])) * scales[1];
+        isum1 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[2], q8bytes.val[2])) * scales[2];
+        isum2 += vaddvq_s32(ggml_vdotq_s32(vzero, q2bytes.val[3], q8bytes.val[3])) * scales[3];
         sum += d * (isum1 + isum2);
     }

     *s = sum;
@@ -4328,9 +4208,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     uint32_t utmp[4];

     const uint8x16_t m3b = vdupq_n_u8(0x3);
-#ifdef __ARM_FEATURE_DOTPROD
     const int32x4_t vzero = vdupq_n_s32(0);
-#endif

     const uint8x16_t m0 = vdupq_n_u8(1);
     const uint8x16_t m1 = vshlq_n_u8(m0, 1);
@@ -4382,22 +4260,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 2), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
         q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 2), m3b)), vreinterpretq_s8_u8(q3h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
-#else
-        int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_1.val[0])), vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_1.val[0])));
-        int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_1.val[1])), vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_1.val[1])));
-        int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_1.val[2])), vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_1.val[2])));
-        int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_1.val[3])), vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_1.val[3])));
-        isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
-#endif
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_1.val[0])) * scale[0];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_1.val[1])) * scale[1];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_1.val[2])) * scale[2];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_1.val[3])) * scale[3];
         scale += 4;

         q3h.val[0] = vbicq_u8(m2, qhbits.val[0]);
@@ -4410,22 +4277,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         q3bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[0], 6), m3b)), vreinterpretq_s8_u8(q3h.val[2]));
         q3bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vandq_u8(vshrq_n_u8(q3bits.val[1], 6), m3b)), vreinterpretq_s8_u8(q3h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
-#else
-        p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes_2.val[0])), vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes_2.val[0])));
-        p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes_2.val[1])), vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes_2.val[1])));
-        p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes_2.val[2])), vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes_2.val[2])));
-        p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes_2.val[3])), vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes_2.val[3])));
-        isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1] + vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
-#endif
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes_2.val[0])) * scale[0];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes_2.val[1])) * scale[1];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes_2.val[2])) * scale[2];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes_2.val[3])) * scale[3];
         scale += 4;

         if (j == 0) {
@@ -4864,10 +4720,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
     const int nb = n / QK_K;

 #ifdef __ARM_NEON
+    const int32x4_t vzero = vdupq_n_s32(0);

-#ifdef __ARM_FEATURE_DOTPROD
-    const int32x4_t vzero = vdupq_n_s32(0);
-#endif
-
     const uint8x16_t m3b = vdupq_n_u8(0x3);
     const uint8x16_t mh  = vdupq_n_u8(4);
@@ -4908,22 +4761,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         q3bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(vshrq_n_u8(q3bits, 4), m3b), q3h.val[2]));
         q3bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q3bits, 6), q3h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
-        isum += vaddvq_s32(vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];
-#else
-        const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q3bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q3bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[2]), vget_low_s8 (q8bytes.val[2])), vmull_s8(vget_high_s8(q3bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q3bytes.val[3]), vget_low_s8 (q8bytes.val[3])), vmull_s8(vget_high_s8(q3bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        isum += vaddvq_s16(p0) * scales[0] + vaddvq_s16(p1) * scales[2] + vaddvq_s16(p2) * scales[1] + vaddvq_s16(p3) * scales[3];
-#endif
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[0], q8bytes.val[0])) * scales[0];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[1], q8bytes.val[1])) * scales[2];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[2], q8bytes.val[2])) * scales[1];
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q3bytes.val[3], q8bytes.val[3])) * scales[3];

         sum += d * isum;

@@ -5228,11 +5069,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     uint32_t utmp[4];

 #ifdef __ARM_NEON

     const uint8x16_t m4b = vdupq_n_u8(0xf);
-#ifdef __ARM_FEATURE_DOTPROD
     const int32x4_t mzero = vdupq_n_s32(0);
-#endif

     ggml_int8x16x2_t q4bytes;
     ggml_int8x16x2_t q8bytes;
@@ -5269,44 +5107,22 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
        int32_t sumi2 = 0;

        for (int j = 0; j < QK_K/64; ++j) {

            const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4); q4 += 32;

-#ifdef __ARM_FEATURE_DOTPROD
            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
            q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));

-           const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+           const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
            sumi1 += vaddvq_s32(p1) * scales[2*j+0];

            q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
            q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
            q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

-           const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+           const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);

            sumi2 += vaddvq_s32(p2) * scales[2*j+1];
-#else
-           q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-           q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
-           q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
-           const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-           const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-           sumi1 += vaddvq_s16(vaddq_s16(p0, p1)) * scales[2*j+0];
-
-           q8bytes = ggml_vld1q_s8_x2(q8); q8 += 32;
-           q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-           q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-           const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-           const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-           sumi2 += vaddvq_s16(vaddq_s16(p2, p3)) * scales[2*j+1];
-#endif
        }

        sumf += d * (sumi1 + sumi2);
@@ -5603,12 +5419,9 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
     const int nb = n / QK_K;

 #ifdef __ARM_NEON

     const uint8x16_t m4b = vdupq_n_u8(0xf);

-#ifdef __ARM_FEATURE_DOTPROD
     const int32x4_t mzero = vdupq_n_s32(0);
-#endif

     float sumf = 0;

@@ -5636,41 +5449,20 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri

        const ggml_uint8x16x2_t q4bits = ggml_vld1q_u8_x2(q4);

-#ifdef __ARM_FEATURE_DOTPROD
        q8bytes = ggml_vld1q_s8_x4(q8);
        q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
        q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));

-       const int32x4_t p1 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
+       const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[0]), q4bytes.val[1], q8bytes.val[1]);
        const int32_t sumi1 = vaddvq_s32(p1) * scales[0];

        q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
        q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));

-       const int32x4_t p2 = vdotq_s32(vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
+       const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, q4bytes.val[0], q8bytes.val[2]), q4bytes.val[1], q8bytes.val[3]);
        const int32_t sumi2 = vaddvq_s32(p2) * scales[1];

-#else
-       q8bytes = ggml_vld1q_s8_x4(q8);
-       q4bytes.val[0] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[0], m4b));
-       q4bytes.val[1] = vreinterpretq_s8_u8(vandq_u8 (q4bits.val[1], m4b));
-       const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-       const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-       int32_t sumi1 = vaddvq_s16(vaddq_s16(p0, p1)) * scales[0];
-
-       q4bytes.val[0] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[0], 4));
-       q4bytes.val[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits.val[1], 4));
-       const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[0]), vget_low_s8 (q8bytes.val[2])), vmull_s8(vget_high_s8(q4bytes.val[0]), vget_high_s8(q8bytes.val[2])));
-       const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q4bytes.val[1]), vget_low_s8 (q8bytes.val[3])), vmull_s8(vget_high_s8(q4bytes.val[1]), vget_high_s8(q8bytes.val[3])));
-       int32_t sumi2 = vaddvq_s16(vaddq_s16(p2, p3)) * scales[1];
-#endif
        sumf += d * (sumi1 + sumi2);

     }

     *s = sumf - sum_mins;
@@ -5875,15 +5667,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri

     uint32_t utmp[4];

 #ifdef __ARM_NEON

     const uint8x16_t m4b = vdupq_n_u8(0xf);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
-#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t mzero = vdupq_n_s32(0);
-#endif

     ggml_int8x16x4_t q5bytes;

@@ -5938,28 +5726,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         q5bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[0], 4), q5h.val[2]));
         q5bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q5bits.val[1], 4), q5h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-        sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
-        sumi += vaddvq_s32(vdotq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
-#else
-        const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        sumi += vaddvq_s16(vaddq_s16(p0, p1)) * *scales++;
-
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])), vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])), vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        sumi += vaddvq_s16(vaddq_s16(p2, p3)) * *scales++;
-#endif
+        sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]), q5bytes.val[1], q8bytes.val[1])) * *scales++;
+        sumi += vaddvq_s32(ggml_vdotq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]), q5bytes.val[3], q8bytes.val[3])) * *scales++;
     }

     sumf += d * sumi - dmin * sumi_mins;

     }

     *s = sumf;
@@ -6311,12 +6082,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
     const int nb = n / QK_K;

 #ifdef __ARM_NEON

     const uint8x16_t m4b = vdupq_n_u8(0xf);
     const uint8x16_t mh = vdupq_n_u8(16);
-#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t mzero = vdupq_n_s32(0);
-#endif

     ggml_int8x16x4_t q5bytes;
     ggml_uint8x16x4_t q5h;
@@ -6348,32 +6116,12 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
         q5bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[0], 4)), vreinterpretq_s8_u8(q5h.val[2]));
         q5bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vshrq_n_u8(q5bits.val[1], 4)), vreinterpretq_s8_u8(q5h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-        int32_t sumi1 = sc[0] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
-        int32_t sumi2 = sc[1] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
-        int32_t sumi3 = sc[2] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
-        int32_t sumi4 = sc[3] * vaddvq_s32(vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));
-
-        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);
-
-#else
-        const int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q5bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        const int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q5bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        int32_t sumi = sc[0] * vaddvq_s16(p0) + sc[1] * vaddvq_s16(p1);
-
-        const int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[2]), vget_low_s8 (q8bytes.val[2])), vmull_s8(vget_high_s8(q5bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        const int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q5bytes.val[3]), vget_low_s8 (q8bytes.val[3])), vmull_s8(vget_high_s8(q5bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        sumi += sc[2] * vaddvq_s16(p2) + sc[3] * vaddvq_s16(p3);
-
-        sumf += d*sumi;
-#endif
+        int32_t sumi1 = sc[0] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[0], q8bytes.val[0]));
+        int32_t sumi2 = sc[1] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[1], q8bytes.val[1]));
+        int32_t sumi3 = sc[2] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[2], q8bytes.val[2]));
+        int32_t sumi4 = sc[3] * vaddvq_s32(ggml_vdotq_s32(mzero, q5bytes.val[3], q8bytes.val[3]));

+        sumf += d * (sumi1 + sumi2 + sumi3 + sumi4);

     }

     *s = sumf;
@@ -6600,13 +6348,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     const int nb = n / QK_K;

 #ifdef __ARM_NEON

     float sum = 0;

     const uint8x16_t m4b = vdupq_n_u8(0xF);
-#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
-#endif
     //const int8x16_t m32s = vdupq_n_s8(32);

     const uint8x16_t mone = vdupq_n_u8(3);
@@ -6626,7 +6371,7 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri

         const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums);
         const int8x16_t scales = vld1q_s8(scale);
-        const ggml_int16x8x2_t q6scales = {vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))};
+        const ggml_int16x8x2_t q6scales = {{vmovl_s8(vget_low_s8(scales)), vmovl_s8(vget_high_s8(scales))}};

         const int32x4_t prod = vaddq_s32(vaddq_s32(vmull_s16(vget_low_s16 (q8sums.val[0]), vget_low_s16 (q6scales.val[0])),
                                                    vmull_s16(vget_high_s16(q8sums.val[0]), vget_high_s16(q6scales.val[0]))),
@@ -6658,31 +6403,13 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[2], m4b), q6h.val[2]));
         q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vandq_u8(q6bits.val[3], m4b), q6h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
-        isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
-                vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
-#else
-        int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])), vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
-        int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])), vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
-        isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
-        scale += 2;
-
-        int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])), vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
-        int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])), vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
-        isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
-        scale += 2;
-#endif
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
+                vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];

         scale += 4;

         q8bytes = ggml_vld1q_s8_x4(q8); q8 += 64;

         shifted = vshrq_n_u8(qhbits.val[0], 4);
@@ -6703,34 +6430,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
         q6bytes.val[2] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[2], 4), q6h.val[2]));
         q6bytes.val[3] = vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[3], 4), q6h.val[3]));

-#if defined(__ARM_FEATURE_DOTPROD)
+        isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||||
|
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
|
||||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
|
||||||
scale += 4;
|
scale += 4;
|
||||||
|
|
||||||
//for (int l = 0; l < 4; ++l) {
|
|
||||||
// const int32x4_t p = vdotq_s32(vzero, q6bytes.val[l], q8bytes.val[l]);
|
|
||||||
// isum += vaddvq_s32(p) * *scale++;
|
|
||||||
//}
|
|
||||||
#else
|
|
||||||
p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
|
||||||
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
|
||||||
p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
|
||||||
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
|
||||||
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
|
|
||||||
scale += 2;
|
|
||||||
|
|
||||||
p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
|
||||||
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
|
||||||
p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
|
||||||
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
|
||||||
isum += vaddvq_s16(p2) * scale[0] + vaddvq_s16(p3) * scale[1];
|
|
||||||
scale += 2;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
}
|
}
|
||||||
//sum += isum * d_all * y[i].d;
|
//sum += isum * d_all * y[i].d;
|
||||||
sum += d_all * y[i].d * (isum - 32 * isum_mins);
|
sum += d_all * y[i].d * (isum - 32 * isum_mins);
|
||||||
@ -7076,14 +6780,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||||||
const int nb = n / QK_K;
|
const int nb = n / QK_K;
|
||||||
|
|
||||||
#ifdef __ARM_NEON
|
#ifdef __ARM_NEON
|
||||||
|
|
||||||
float sum = 0;
|
float sum = 0;
|
||||||
|
|
||||||
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
||||||
const int8x16_t m32s = vdupq_n_s8(32);
|
const int8x16_t m32s = vdupq_n_s8(32);
|
||||||
#if defined(__ARM_FEATURE_DOTPROD)
|
|
||||||
const int32x4_t vzero = vdupq_n_s32(0);
|
const int32x4_t vzero = vdupq_n_s32(0);
|
||||||
#endif
|
|
||||||
|
|
||||||
const uint8x16_t mone = vdupq_n_u8(3);
|
const uint8x16_t mone = vdupq_n_u8(3);
|
||||||
|
|
||||||
@ -7119,26 +6820,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|||||||
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
|
q6bytes.val[2] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[0], 4), q6h.val[2])), m32s);
|
||||||
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
|
q6bytes.val[3] = vsubq_s8(vreinterpretq_s8_u8(vorrq_u8(vshrq_n_u8(q6bits.val[1], 4), q6h.val[3])), m32s);
|
||||||
|
|
||||||
#if defined(__ARM_FEATURE_DOTPROD)
|
isum += vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
||||||
|
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
||||||
isum += vaddvq_s32(vdotq_s32(vzero, q6bytes.val[0], q8bytes.val[0])) * scale[0] +
|
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
||||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[1], q8bytes.val[1])) * scale[1] +
|
vaddvq_s32(ggml_vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
||||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[2], q8bytes.val[2])) * scale[2] +
|
|
||||||
vaddvq_s32(vdotq_s32(vzero, q6bytes.val[3], q8bytes.val[3])) * scale[3];
|
|
||||||
#else
|
|
||||||
|
|
||||||
int16x8_t p0 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[0]), vget_low_s8 (q8bytes.val[0])),
|
|
||||||
vmull_s8(vget_high_s8(q6bytes.val[0]), vget_high_s8(q8bytes.val[0])));
|
|
||||||
int16x8_t p1 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[1]), vget_low_s8 (q8bytes.val[1])),
|
|
||||||
vmull_s8(vget_high_s8(q6bytes.val[1]), vget_high_s8(q8bytes.val[1])));
|
|
||||||
isum += vaddvq_s16(p0) * scale[0] + vaddvq_s16(p1) * scale[1];
|
|
||||||
|
|
||||||
int16x8_t p2 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[2]), vget_low_s8 (q8bytes.val[2])),
|
|
||||||
vmull_s8(vget_high_s8(q6bytes.val[2]), vget_high_s8(q8bytes.val[2])));
|
|
||||||
int16x8_t p3 = vaddq_s16(vmull_s8(vget_low_s8 (q6bytes.val[3]), vget_low_s8 (q8bytes.val[3])),
|
|
||||||
vmull_s8(vget_high_s8(q6bytes.val[3]), vget_high_s8(q8bytes.val[3])));
|
|
||||||
isum += vaddvq_s16(p2) * scale[2] + vaddvq_s16(p3) * scale[3];
|
|
||||||
#endif
|
|
||||||
|
|
||||||
sum += isum * d_all * y[i].d;
|
sum += isum * d_all * y[i].d;
|
||||||
|
|
||||||
|
@@ -70,7 +70,7 @@ static_assert(sizeof(block_q8_1) == 2*sizeof(float) + QK8_1, "wrong q8_1 block s
 // 2-bit quantization
 // weight is represented as x = a * q + b
 // 16 blocks of 16 elements each
-// Effectively 2.5625 bits per weight
+// Effectively 2.625 bits per weight
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants

56 ggml.h

@@ -255,6 +255,8 @@
 #define GGML_UNREACHABLE() GGML_ASSERT(!"statement should not be reached")
 #elif defined(__GNUC__)
 #define GGML_UNREACHABLE() __builtin_unreachable()
+#elif defined(_MSC_VER)
+#define GGML_UNREACHABLE() __assume(0)
 #else
 #define GGML_UNREACHABLE() ((void) 0)
 #endif
@@ -303,7 +305,7 @@ extern "C" {

 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
@@ -343,6 +345,12 @@ extern "C" {
         GGML_TYPE_COUNT,
     };

+    // precision
+    enum ggml_prec {
+        GGML_PREC_DEFAULT,
+        GGML_PREC_F32,
+    };
+
     enum ggml_backend_type {
         GGML_BACKEND_CPU = 0,
         GGML_BACKEND_GPU = 10,
@@ -478,7 +486,8 @@ extern "C" {
     enum ggml_log_level {
         GGML_LOG_LEVEL_ERROR = 2,
         GGML_LOG_LEVEL_WARN  = 3,
-        GGML_LOG_LEVEL_INFO  = 4
+        GGML_LOG_LEVEL_INFO  = 4,
+        GGML_LOG_LEVEL_DEBUG = 5
     };

     // ggml object
@@ -502,7 +511,6 @@ extern "C" {

         struct ggml_backend_buffer * buffer;

-        int     n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
         size_t  nb[GGML_MAX_DIMS]; // stride in bytes:
                                    // nb[0] = ggml_type_size(type)
@@ -534,7 +542,7 @@ extern "C" {

         void * extra; // extra things e.g. for ggml-cuda.cu

-        char padding[12];
+        char padding[8];
     };

     static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
@@ -639,11 +647,14 @@ extern "C" {
     GGML_API int64_t ggml_nrows       (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes      (const struct ggml_tensor * tensor);
     GGML_API size_t  ggml_nbytes_pad  (const struct ggml_tensor * tensor); // same as ggml_nbytes() but padded to GGML_MEM_ALIGN
-    GGML_API size_t  ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split);

-    GGML_API int     ggml_blck_size (enum ggml_type type);
-    GGML_API size_t  ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
-    GGML_API float   ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
+    GGML_API int    ggml_blck_size(enum ggml_type type);
+    GGML_API size_t ggml_type_size(enum ggml_type type);             // size in bytes for all elements in a block
+    GGML_API size_t ggml_row_size (enum ggml_type type, int64_t ne); // size in bytes for all elements in a row
+
+    GGML_DEPRECATED(
+    GGML_API double ggml_type_sizef(enum ggml_type type), // ggml_type_size()/ggml_blck_size() as float
+    "use ggml_row_size() instead");

     GGML_API const char * ggml_type_name(enum ggml_type type);
     GGML_API const char * ggml_op_name  (enum ggml_op   op);
@@ -662,6 +673,11 @@ extern "C" {
     GGML_API bool ggml_is_transposed(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_contiguous(const struct ggml_tensor * tensor);
     GGML_API bool ggml_is_permuted  (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_scalar    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_vector    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_matrix    (const struct ggml_tensor * tensor);
+    GGML_API bool ggml_is_3d        (const struct ggml_tensor * tensor);
+    GGML_API int  ggml_n_dims       (const struct ggml_tensor * tensor); // returns 1 for scalars

     GGML_API bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1);

@@ -722,8 +738,8 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);

     // Context tensor enumeration and lookup
-    GGML_API struct ggml_tensor * ggml_get_first_tensor(struct ggml_context * ctx);
-    GGML_API struct ggml_tensor * ggml_get_next_tensor (struct ggml_context * ctx, struct ggml_tensor * tensor);
+    GGML_API struct ggml_tensor * ggml_get_first_tensor(const struct ggml_context * ctx);
+    GGML_API struct ggml_tensor * ggml_get_next_tensor (const struct ggml_context * ctx, struct ggml_tensor * tensor);
     GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);

     GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
@@ -1050,6 +1066,12 @@ extern "C" {
             struct ggml_tensor  * a,
             struct ggml_tensor  * b);

+    // change the precision of a matrix multiplication
+    // set to GGML_PREC_F32 for higher precision (useful for phi-2)
+    GGML_API void ggml_mul_mat_set_prec(
+            struct ggml_tensor * a,
+            enum   ggml_prec     prec);
+
     // indirect matrix multiplication
     //  ggml_mul_mat_id(ctx, as, ids, id, b) ~= ggml_mul_mat(as[ids[id]], b)
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
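
The hunk above introduces `enum ggml_prec` together with `ggml_mul_mat_set_prec()`. A minimal usage sketch, not taken from this diff (`ctx`, `w` and `cur` are placeholder names), showing how a caller would request F32 accumulation for a single matrix multiplication:

    // build the mat-mul node as usual; it defaults to GGML_PREC_DEFAULT
    struct ggml_tensor * kq = ggml_mul_mat(ctx, w, cur);
    // opt into higher precision for this node only (useful for phi-2, per the comment above)
    ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
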
@@ -1075,13 +1097,13 @@ extern "C" {
     GGML_API struct ggml_tensor * ggml_scale(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            float                 s);

     // in-place, returns view(a)
     GGML_API struct ggml_tensor * ggml_scale_inplace(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            float                 s);

     // b -> view(a,offset,nb1,nb2,3), return modified a
     GGML_API struct ggml_tensor * ggml_set(
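
With the change above, `ggml_scale()` and `ggml_scale_inplace()` take the scale factor as a plain `float` instead of a one-element tensor. A hedged sketch of the corresponding call-site change (`ctx0` and `cur` are placeholders; the old form matches the `ggml_new_f32` pattern visible in the whisper.cpp comments further down):

    // before: cur = ggml_scale(ctx0, cur, ggml_new_f32(ctx0, 0.5f));
    cur = ggml_scale(ctx0, cur, 0.5f);
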
@@ -2116,10 +2138,11 @@ extern "C" {
     GGML_API const void * gguf_get_arr_data(const struct gguf_context * ctx, int key_id);
     GGML_API const char * gguf_get_arr_str (const struct gguf_context * ctx, int key_id, int i);

     GGML_API int    gguf_get_n_tensors    (const struct gguf_context * ctx);
     GGML_API int    gguf_find_tensor      (const struct gguf_context * ctx, const char * name);
     GGML_API size_t gguf_get_tensor_offset(const struct gguf_context * ctx, int i);
     GGML_API char * gguf_get_tensor_name  (const struct gguf_context * ctx, int i);
+    GGML_API enum ggml_type gguf_get_tensor_type (const struct gguf_context * ctx, int i);

     // overrides existing values or adds a new one
     GGML_API void gguf_set_val_u8  (struct gguf_context * ctx, const char * key, uint8_t  val);
@@ -2175,6 +2198,7 @@ extern "C" {
     //

     GGML_API int ggml_cpu_has_avx        (void);
+    GGML_API int ggml_cpu_has_avx_vnni   (void);
     GGML_API int ggml_cpu_has_avx2       (void);
     GGML_API int ggml_cpu_has_avx512     (void);
     GGML_API int ggml_cpu_has_avx512_vbmi(void);
@@ -143,20 +143,7 @@ class AudioEncoderANE(AudioEncoder):
             x = block(x)

         x = self.ln_post(x)
+        x = x.squeeze(2).transpose(1, 2)

-        # """
-        # TODO:
-        # I think we need to transpose the result here to make it fit whisper.cpp memory order.
-        # However, even doing this, the results are still wrong. Kind of less wrong compared to
-        # not transposing, but still wrong.
-
-        # Also, I don't know why the original OpenAI implementation does not need to transpose
-
-        # transpose to (batch_size, n_ctx, n_state)
-        # x : torch.Tensor, shape = (batch_size, n_state, 1, n_ctx)
-
-        # """
-        # x = x.transpose(1,3)
-
         return x

@@ -19,7 +19,7 @@ function get_script_path() {
     fi
 }

-models_path="$(get_script_path)"
+models_path="${2:-$(get_script_path)}"

 # Whisper models
 models=(
@@ -43,7 +43,7 @@ models=(
     "large-v1"
     "large-v2"
     "large-v3"
-    "large-q5_0"
+    "large-v3-q5_0"
 )

 # list available models
@@ -56,8 +56,8 @@ function list_models {
     printf "\n\n"
 }

-if [ "$#" -ne 1 ]; then
-    printf "Usage: $0 <model>\n"
+if [ "$#" -lt 1 ] || [ "$#" -gt 2 ]; then
+    printf "Usage: $0 <model> [models_path]\n"
     list_models

     exit 1
@@ -105,7 +105,7 @@ if [ $? -ne 0 ]; then
     exit 1
 fi

-printf "Done! Model '$model' saved in 'models/ggml-$model.bin'\n"
+printf "Done! Model '$model' saved in '$models_path/ggml-$model.bin'\n"
 printf "You can now use it like this:\n\n"
-printf "  $ ./main -m models/ggml-$model.bin -f samples/jfk.wav\n"
+printf "  $ ./main -m $models_path/ggml-$model.bin -f samples/jfk.wav\n"
 printf "\n"
@@ -23,7 +23,7 @@ if [[ $mname == "-h5" ]]; then
     echo $mpath
     python3 models/convert-h5-to-coreml.py --model-name $mname --model-path $mpath --encoder-only True
 else
-    python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
+    python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True --optimize-ane True
 fi

 xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
@@ -64,15 +64,15 @@ int whisper_openvino_encode(
         return 0;
     }

-    if (mel->n_dims != 2) {
+    if (ggml_n_dims(mel) != 2) {
         fprintf(stderr, "%s: Error! mel ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
-                __func__, mel->n_dims);
+                __func__, ggml_n_dims(mel));
         return 0;
     }

-    if (out->n_dims != 2) {
+    if (ggml_n_dims(out) != 2) {
         fprintf(stderr, "%s: Error! out ggml_tensor expected to have n_dims=2, but it has n_dims=%d\n",
-                __func__, out->n_dims);
+                __func__, ggml_n_dims(out));
         return 0;
     }

@@ -105,4 +105,4 @@ int whisper_openvino_encode(
     }

     return 1;
 }
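
Because `struct ggml_tensor` no longer carries an `n_dims` field (see the ggml.h hunk earlier), code that previously read `tensor->n_dims` now calls the new `ggml_n_dims()` helper, as the OpenVINO encoder does above. A minimal sketch with a placeholder tensor `t`:

    if (ggml_n_dims(t) != 2) {
        // not a 2-D tensor (note: ggml_n_dims() returns 1 for scalars)
    }
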
136 whisper.cpp

@@ -122,9 +122,18 @@ WHISPER_ATTRIBUTE_FORMAT(2, 3)
 static void whisper_log_internal        (ggml_log_level level, const char * format, ...);
 static void whisper_log_callback_default(ggml_log_level level, const char * text, void * user_data);

-#define WHISPER_LOG_INFO(...)  whisper_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
-#define WHISPER_LOG_WARN(...)  whisper_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define WHISPER_LOG_ERROR(...) whisper_log_internal(GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+#define WHISPER_LOG_WARN(...)  whisper_log_internal(GGML_LOG_LEVEL_WARN , __VA_ARGS__)
+#define WHISPER_LOG_INFO(...)  whisper_log_internal(GGML_LOG_LEVEL_INFO , __VA_ARGS__)
+
+// define this to enable verbose trace logging - useful for debugging purposes
+// #define WHISPER_DEBUG
+
+#if defined(WHISPER_DEBUG)
+#define WHISPER_LOG_DEBUG(...) whisper_log_internal(GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
+#else
+#define WHISPER_LOG_DEBUG(...)
+#endif

 #define WHISPER_ASSERT(x) \
     do { \
@@ -134,18 +143,6 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
         } \
     } while (0)

-// define this to enable verbose trace logging - useful for debugging purposes
-//#define WHISPER_DEBUG
-
-#if defined(WHISPER_DEBUG)
-#define WHISPER_PRINT_DEBUG(...) \
-    do { \
-        fprintf(stderr, __VA_ARGS__); \
-    } while (0)
-#else
-#define WHISPER_PRINT_DEBUG(...)
-#endif
-
 //#define WHISPER_USE_FLASH_ATTN
 //#define WHISPER_USE_FLASH_FF
 #define WHISPER_MAX_DECODERS 8
@@ -155,7 +152,7 @@ static void whisper_log_callback_default(ggml_log_level level, const char * text
 // ggml helpers
 //

-static void ggml_graph_compute_helper(
+static bool ggml_graph_compute_helper(
           struct ggml_cgraph * graph,
         std::vector<uint8_t> & buf,
                          int   n_threads,
@@ -171,10 +168,10 @@ static void ggml_graph_compute_helper(
         plan.work_data = buf.data();
     }

-    ggml_graph_compute(graph, &plan);
+    return ggml_graph_compute(graph, &plan);
 }

-static void ggml_graph_compute_helper(
+static bool ggml_graph_compute_helper(
        struct ggml_backend * backend,
         struct ggml_cgraph * graph,
                        int   n_threads) {
@@ -186,7 +183,7 @@ static void ggml_graph_compute_helper(
         ggml_backend_metal_set_n_cb(backend, n_threads);
     }
 #endif
-    ggml_backend_graph_compute(backend, graph);
+    return ggml_backend_graph_compute(backend, graph);
 }

 // faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
@@ -487,8 +484,8 @@ static size_t whisper_allocr_size(struct whisper_allocr & allocr) {

 // measure the memory usage of a graph and prepare the allocr's internal data buffer
 static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
     auto & alloc = allocr.alloc;
     auto & meta  = allocr.meta;

     alloc = ggml_allocr_new_measure_from_backend(backend);

@@ -1777,7 +1774,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(

     ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);

-    ggml_allocr * alloc = wstate.alloc_encode.alloc;
+    //ggml_allocr * alloc = wstate.alloc_encode.alloc;

     //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_ctx, n_state);
     //ggml_allocr_alloc(alloc, cur);
@@ -1787,13 +1784,7 @@ static struct ggml_cgraph * whisper_build_graph_encoder(
     //}
     struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_conv);

-    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, KQscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = 1.0f/sqrtf(float(n_state)/n_head);
-        ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = 1.0f/sqrtf(float(n_state)/n_head);

     // ===================================================================
     // NOTE: experimenting with partial evaluation of the encoder (ignore)
@@ -1843,14 +1834,14 @@ static struct ggml_cgraph * whisper_build_graph_encoder(

                 Qcur = ggml_add(ctx0, Qcur, layer.attn_q_b);

-                //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+                //Qcur = ggml_scale(ctx0, Qcur, pow(float(n_state)/n_head, -0.25));

                 // note: no bias for Key
                 struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
                         layer.attn_k_w,
                         cur);

-                //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+                //Kcur = ggml_scale(ctx0, Kcur, pow(float(n_state)/n_head, -0.25));

                 struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
                         layer.attn_v_w,
@@ -2032,7 +2023,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(

     ggml_cgraph * gf = ggml_new_graph(ctx0);

-    ggml_allocr * alloc = wstate.alloc_cross.alloc;
+    //ggml_allocr * alloc = wstate.alloc_cross.alloc;

     //struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
     //ggml_allocr_alloc(alloc, cur);
@@ -2042,13 +2033,7 @@ static struct ggml_cgraph * whisper_build_graph_cross(
     //}
     struct ggml_tensor * cur = ggml_view_tensor(ctx0, wstate.embd_enc);

-    struct ggml_tensor * Kscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, Kscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state) / n_head, -0.25);
-        ggml_backend_tensor_set(Kscale, &val, 0, sizeof(float));
-    }
+    const float Kscale = pow(float(n_state) / n_head, -0.25);

     for (int il = 0; il < model.hparams.n_text_layer; ++il) {
         auto & layer = model.layers_decoder[il];
@@ -2118,7 +2103,9 @@ static bool whisper_encode_internal(
         ggml_allocr_alloc_graph(alloc, gf);

         if (!whisper_encode_external(wstate)) {
-            ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+            if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+                return false;
+            }
         }
     }

@@ -2132,7 +2119,9 @@ static bool whisper_encode_internal(

         ggml_allocr_alloc_graph(alloc, gf);

-        ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
     }

     // cross
@@ -2145,7 +2134,9 @@ static bool whisper_encode_internal(

         ggml_allocr_alloc_graph(alloc, gf);

-        ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
     }

     wstate.t_encode_us += ggml_time_us() - t_start_us;
@@ -2178,7 +2169,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
     const int32_t n_kv    = ggml_allocr_is_measure(alloc) ? n_ctx            : kv_self.n;
     const int32_t kv_head = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;

-    //WHISPER_PRINT_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
+    //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);

     struct ggml_init_params params = {
         /*.mem_size   =*/ wstate.alloc_decode.meta.size(),
@@ -2207,13 +2198,7 @@ static struct ggml_cgraph * whisper_build_graph_decoder(
         }
     }

-    struct ggml_tensor * KQscale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
-    ggml_allocr_alloc(alloc, KQscale);
-
-    if (!ggml_allocr_is_measure(alloc)) {
-        const float val = pow(float(n_state)/n_head, -0.25);
-        ggml_backend_tensor_set(KQscale, &val, 0, sizeof(float));
-    }
+    const float KQscale = pow(float(n_state)/n_head, -0.25);

     struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_allocr_alloc(alloc, KQ_mask);
@@ -2573,7 +2558,9 @@ static bool whisper_decode_internal(

         logits = gf->nodes[gf->n_nodes - 1];

-        ggml_graph_compute_helper(wstate.backend, gf, n_threads);
+        if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
+            return false;
+        }
     }

     logits_out.resize(n_tokens*n_vocab);
@@ -3834,6 +3821,7 @@ void whisper_reset_timings(struct whisper_context * ctx) {
     ctx->state->t_sample_us = 0;
     ctx->state->t_encode_us = 0;
     ctx->state->t_decode_us = 0;
+    ctx->state->t_batchd_us = 0;
     ctx->state->t_prompt_us = 0;
     ctx->state->n_sample = 0;
     ctx->state->n_encode = 0;
@@ -4962,7 +4950,7 @@ static void whisper_sequence_score(
         const auto p = kv.second/(double)cnt;
         entropy -= p*log(p);

-        //WHISPER_PRINT_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
+        //WHISPER_LOG_DEBUG("entropy: %d %f %f, count %d\n", kv.first, p, log(p), kv.second);
     }

     sequence.entropy = entropy;
@@ -5028,7 +5016,7 @@ int whisper_full_with_state(
     // basically don't process anything that is less than 1.0s
     // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
     if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
-        WHISPER_PRINT_DEBUG("%s: input is too short - %d ms < 1000 ms\n", __func__, (seek_end - seek_start)*10);
+        WHISPER_LOG_DEBUG("%s: input is too short - %d ms < 1000 ms\n", __func__, (seek_end - seek_start)*10);
         return 0;
     }

@@ -5217,7 +5205,7 @@ int whisper_full_with_state(

         n_decoders_cur = std::max(1, n_decoders_cur);

-        WHISPER_PRINT_DEBUG("\n%s: strategy = %d, decoding with %d decoders, temperature = %.2f\n", __func__, params.strategy, n_decoders_cur, t_cur);
+        WHISPER_LOG_DEBUG("\n%s: strategy = %d, decoding with %d decoders, temperature = %.2f\n", __func__, params.strategy, n_decoders_cur, t_cur);

         // TAGS: WHISPER_DECODER_INIT
         for (int j = 0; j < n_decoders_cur; ++j) {
@@ -5261,11 +5249,11 @@ int whisper_full_with_state(
             prompt.insert(prompt.end(), prompt_init.begin(), prompt_init.end());

             // print the prompt
-            WHISPER_PRINT_DEBUG("\n\n");
+            WHISPER_LOG_DEBUG("\n\n");
             for (int i = 0; i < (int) prompt.size(); i++) {
-                WHISPER_PRINT_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
+                WHISPER_LOG_DEBUG("%s: prompt[%d] = %s\n", __func__, i, ctx->vocab.id_to_token.at(prompt[i]).c_str());
             }
-            WHISPER_PRINT_DEBUG("\n\n");
+            WHISPER_LOG_DEBUG("\n\n");

             whisper_kv_cache_clear(state->kv_self);

@@ -5413,7 +5401,7 @@ int whisper_full_with_state(

                         whisper_kv_cache_seq_cp(state->kv_self, cur.decoder_idx, WHISPER_MAX_DECODERS + j, -1, -1);

-                        WHISPER_PRINT_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
+                        WHISPER_LOG_DEBUG("%s: beam search: decoder %d: from decoder %d: token = %10s, plog = %8.5f, sum_logprobs = %8.5f\n",
                                 __func__, j, cur.decoder_idx, ctx->vocab.id_to_token.at(decoder.sequence.tokens.back().id).c_str(), decoder.sequence.tokens.back().plog, decoder.sequence.sum_logprobs_all);
                     }

@@ -5456,7 +5444,7 @@ int whisper_full_with_state(

                     // do not allow to go back in time
                     if (has_ts && seek_delta > seek_delta_new && result_len < i) {
-                        WHISPER_PRINT_DEBUG("%s: decoder %d: failed due to seek_delta (%d > %d)\n", __func__, j, seek_delta, seek_delta_new);
+                        WHISPER_LOG_DEBUG("%s: decoder %d: failed due to seek_delta (%d > %d)\n", __func__, j, seek_delta, seek_delta_new);
                         failed = true; // TODO: maybe this is not a failure ?
                         continue;
                     }
@@ -5471,7 +5459,7 @@ int whisper_full_with_state(
 #ifdef WHISPER_DEBUG
                     {
                         const auto tt = token.pt > 0.10 ? ctx->vocab.id_to_token.at(token.tid) : "[?]";
-                        WHISPER_PRINT_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n",
+                        WHISPER_LOG_DEBUG("%s: id = %3d, decoder = %d, token = %6d, p = %6.3f, ts = %10s, %6.3f, result_len = %4d '%s'\n",
                                 __func__, i, j, token.id, token.p, tt.c_str(), token.pt, result_len, ctx->vocab.id_to_token.at(token.id).c_str());
                     }
 #endif
@@ -5485,7 +5473,7 @@ int whisper_full_with_state(
                         if (seek + seek_delta + 100 >= seek_end) {
                             result_len = i + 1;
                         } else {
-                            WHISPER_PRINT_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
+                            WHISPER_LOG_DEBUG("%s: decoder %d failed (result_len = 0)\n", __func__, j);
                             failed = true;
                             continue;
                         }
@@ -5496,7 +5484,7 @@ int whisper_full_with_state(
                             seek_delta = 100*WHISPER_CHUNK_SIZE;
                         }

-                        WHISPER_PRINT_DEBUG("%s: decoder %d completed\n", __func__, j);
+                        WHISPER_LOG_DEBUG("%s: decoder %d completed\n", __func__, j);
                         completed = true;
                         continue;
                     }
@@ -5512,7 +5500,7 @@ int whisper_full_with_state(
                     // sometimes, the decoding can get stuck in a repetition loop
                     // this is an attempt to mitigate such cases - we flag the decoding as failed and use a fallback strategy
                     if (i == n_max - 1 && (result_len == 0 || seek_delta < 100*WHISPER_CHUNK_SIZE/2)) {
-                        WHISPER_PRINT_DEBUG("%s: decoder %d: failed due to repetition loop\n", __func__, j);
+                        WHISPER_LOG_DEBUG("%s: decoder %d: failed due to repetition loop\n", __func__, j);
                         failed = true;
                         continue;
                     }
@@ -5554,7 +5542,7 @@ int whisper_full_with_state(
                         continue;
                     }

-                    //WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, seek_delta %d\n", __func__, j, decoder.sequence.tokens.back().id, decoder.seek_delta);
+                    //WHISPER_LOG_DEBUG("%s: decoder %d: token %d, seek_delta %d\n", __func__, j, decoder.sequence.tokens.back().id, decoder.seek_delta);

                     decoder.i_batch = batch.n_tokens;

@@ -5634,11 +5622,11 @@ int whisper_full_with_state(
                 decoder.sequence.tokens.resize(decoder.sequence.result_len);
                 whisper_sequence_score(params, decoder.sequence);

-                WHISPER_PRINT_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
+                WHISPER_LOG_DEBUG("%s: decoder %2d: score = %8.5f, result_len = %3d, avg_logprobs = %8.5f, entropy = %8.5f\n",
                         __func__, j, decoder.sequence.score, decoder.sequence.result_len, decoder.sequence.avg_logprobs, decoder.sequence.entropy);

                 if (decoder.sequence.result_len > 32 && decoder.sequence.entropy < params.entropy_thold) {
-                    WHISPER_PRINT_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
+                    WHISPER_LOG_DEBUG("%s: decoder %2d: failed due to entropy %8.5f < %8.5f\n",
                             __func__, j, decoder.sequence.entropy, params.entropy_thold);

                     decoder.failed = true;
@@ -5653,7 +5641,7 @@ int whisper_full_with_state(
                 }
             }

-            WHISPER_PRINT_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id);
+            WHISPER_LOG_DEBUG("%s: best decoder = %d\n", __func__, best_decoder_id);
         }

         bool success = true;
@@ -5665,7 +5653,7 @@ int whisper_full_with_state(
             const auto & decoder = state->decoders[best_decoder_id];

             if (decoder.failed || decoder.sequence.avg_logprobs < params.logprob_thold) {
-                WHISPER_PRINT_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold);
+                WHISPER_LOG_DEBUG("%s: failed due to avg_logprobs %8.5f < %8.5f\n", __func__, decoder.sequence.avg_logprobs, params.logprob_thold);
                 success = false;
                 state->n_fail_p++;
             }
@@ -5673,13 +5661,13 @@ int whisper_full_with_state(

             if (success) {
                 //for (auto & token : ctx->decoders[best_decoder_id].sequence.tokens) {
-                //    WHISPER_PRINT_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
+                //    WHISPER_LOG_DEBUG("%s: token = %d, p = %6.3f, pt = %6.3f, ts = %s, str = %s\n", __func__, token.id, token.p, token.pt, ctx->vocab.id_to_token.at(token.tid).c_str(), ctx->vocab.id_to_token.at(token.id).c_str());
                 //}

                 break;
             }

-            WHISPER_PRINT_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur);
+            WHISPER_LOG_DEBUG("\n%s: failed to decode with temperature = %.2f\n", __func__, t_cur);
         }

         // output results through a user-provided callback
@@ -5691,7 +5679,7 @@ int whisper_full_with_state(

         const auto & tokens_cur = best_decoder.sequence.tokens;

-        //WHISPER_PRINT_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);
+        //WHISPER_LOG_DEBUG("prompt_init.size() = %d, prompt.size() = %d, result_len = %d, seek_delta = %d\n", prompt_init.size(), prompt.size(), result_len, seek_delta);

         // update prompt_past
         prompt_past.clear();
@@ -5811,7 +5799,7 @@ int whisper_full_with_state(
         // update audio window
         seek += seek_delta;

-        WHISPER_PRINT_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
+        WHISPER_LOG_DEBUG("seek = %d, seek_delta = %d\n", seek, seek_delta);
     }
 }

@@ -6128,7 +6116,7 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {

     // multi-thread

-    for (uint32_t k = 1; k <= n_threads; k++) {
+    for (int32_t k = 1; k <= n_threads; k++) {
         char * src = (char *) malloc(size);
         char * dst = (char *) malloc(size);

@@ -6152,13 +6140,13 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) {
         const int64_t t0 = ggml_time_us();

         std::vector<std::thread> threads(k - 1);
-        for (uint32_t th = 0; th < k - 1; ++th) {
+        for (int32_t th = 0; th < k - 1; ++th) {
             threads[th] = std::thread(helper, th);
         }

         helper(k - 1);

-        for (uint32_t th = 0; th < k - 1; ++th) {
+        for (int32_t th = 0; th < k - 1; ++th) {
             threads[th].join();
         }
