mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-08-11 01:34:04 +02:00
Compare commits
1 Commits
gg/whisper
...
ci/env
Author | SHA1 | Date | |
---|---|---|---|
05ce7476ae |
458
.github/workflows/build.yml
vendored
458
.github/workflows/build.yml
vendored
@ -6,81 +6,17 @@ on:
|
||||
- master
|
||||
pull_request:
|
||||
types: [opened, synchronize, reopened]
|
||||
workflow_dispatch:
|
||||
inputs:
|
||||
create_release:
|
||||
description: 'Create new release'
|
||||
required: true
|
||||
type: boolean
|
||||
pre_release_tag:
|
||||
description: 'Pre-release tag name'
|
||||
required: false
|
||||
type: string
|
||||
run_type:
|
||||
description: 'Workflow type to run'
|
||||
required: true
|
||||
type: choice
|
||||
options:
|
||||
- full-ci
|
||||
- release-only
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref && github.ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
permissions:
|
||||
contents: write # for creating release
|
||||
|
||||
env:
|
||||
BRANCH_NAME: ${{ github.head_ref || github.ref_name }}
|
||||
ubuntu_image: "ubuntu:22.04"
|
||||
VCPKG_BINARY_SOURCES: "clear;x-gha,readwrite"
|
||||
|
||||
jobs:
|
||||
determine-tag:
|
||||
runs-on: ubuntu-latest
|
||||
outputs:
|
||||
tag_name: ${{ steps.tag.outputs.name }}
|
||||
|
||||
steps:
|
||||
- name: Checkout with full history
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Determine tag name
|
||||
id: tag
|
||||
shell: bash
|
||||
run: |
|
||||
BUILD_NUMBER=$(git rev-list --count HEAD)
|
||||
SHORT_HASH=$(git rev-parse --short=7 HEAD)
|
||||
CUSTOM_TAG="${{ github.event.inputs.pre_release_tag }}"
|
||||
|
||||
echo "Raw values:"
|
||||
echo "BUILD_NUMBER: $BUILD_NUMBER"
|
||||
echo "SHORT_HASH: $SHORT_HASH"
|
||||
echo "BRANCH_NAME: ${{ env.BRANCH_NAME }}"
|
||||
echo "CUSTOM_TAG: $CUSTOM_TAG"
|
||||
|
||||
# Use custom tag if provided
|
||||
if [[ -n "$CUSTOM_TAG" ]]; then
|
||||
echo "Using custom tag"
|
||||
TAG_NAME="${CUSTOM_TAG}"
|
||||
elif [[ "${{ env.BRANCH_NAME }}" == "master" ]]; then
|
||||
echo "Using master branch format"
|
||||
TAG_NAME="b${BUILD_NUMBER}"
|
||||
else
|
||||
echo "Using non-master branch format"
|
||||
SAFE_NAME=$(echo "${{ env.BRANCH_NAME }}" | tr '/' '-')
|
||||
TAG_NAME="${SAFE_NAME}-b${BUILD_NUMBER}-${SHORT_HASH}"
|
||||
fi
|
||||
|
||||
echo "Final tag name: $TAG_NAME"
|
||||
echo "name=$TAG_NAME" >> $GITHUB_OUTPUT
|
||||
|
||||
ubuntu-22:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -107,8 +43,6 @@ jobs:
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
ubuntu-22-arm64:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -135,8 +69,6 @@ jobs:
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
ubuntu-22-arm-v7:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -163,8 +95,6 @@ jobs:
|
||||
cmake --build build --config Release -j $(nproc)'
|
||||
|
||||
macOS-latest:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: macOS-latest
|
||||
|
||||
strategy:
|
||||
@ -199,28 +129,31 @@ jobs:
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"
|
||||
cmake --build build --config Release -j $(sysctl -n hw.logicalcpu)
|
||||
|
||||
- name: xcodebuild for swift package
|
||||
id: xcodebuild
|
||||
run: |
|
||||
./build-xcframework.sh
|
||||
|
||||
freeBSD-latest:
|
||||
runs-on: macos-13
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Build
|
||||
uses: cross-platform-actions/action@v0.27.0
|
||||
with:
|
||||
operating_system: freebsd
|
||||
version: '14.2'
|
||||
run: |
|
||||
sudo pkg update
|
||||
sudo pkg install -y gmake sdl2 cmake git
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
# freeBSD-latest:
|
||||
# runs-on: macos-12
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: Build
|
||||
# uses: cross-platform-actions/action@v0.24.0
|
||||
# with:
|
||||
# operating_system: freebsd
|
||||
# version: '13.3'
|
||||
# run: |
|
||||
# sudo pkg update
|
||||
# sudo pkg install -y gmake sdl2 cmake
|
||||
# cmake -B build
|
||||
# cmake --build build --config Release
|
||||
|
||||
ubuntu-22-gcc:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -249,8 +182,6 @@ jobs:
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-22-gcc-arm64:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -279,8 +210,6 @@ jobs:
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-22-gcc-arm-v7:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -309,8 +238,6 @@ jobs:
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-22-clang:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -342,8 +269,6 @@ jobs:
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-22-gcc-sanitized:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -367,15 +292,11 @@ jobs:
|
||||
set -e
|
||||
apt update
|
||||
apt install -y build-essential cmake git
|
||||
cmake . -DCMAKE_BUILD_TYPE=Debug \
|
||||
-DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON \
|
||||
-DGGML_OPENMP=OFF
|
||||
cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
|
||||
ubuntu-22-cmake-sycl:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -426,8 +347,6 @@ jobs:
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
ubuntu-22-cmake-sycl-fp16:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -478,8 +397,6 @@ jobs:
|
||||
cmake --build . --config Release -j $(nproc)
|
||||
|
||||
windows-msys2:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
@ -524,8 +441,6 @@ jobs:
|
||||
cmake --build build --config ${{ matrix.build }} -j $(nproc)
|
||||
|
||||
windows:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
@ -586,8 +501,6 @@ jobs:
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
windows-blas:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: windows-latest
|
||||
|
||||
strategy:
|
||||
@ -661,8 +574,6 @@ jobs:
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
windows-cublas:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: windows-2019
|
||||
strategy:
|
||||
matrix:
|
||||
@ -679,134 +590,15 @@ jobs:
|
||||
- name: Clone repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Install Ninja
|
||||
id: install_ninja
|
||||
run: |
|
||||
choco install ninja
|
||||
|
||||
- name: Install ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: ${{ github.job }}-${{ matrix.cuda-toolkit }}-${{ matrix.build }}
|
||||
variant: sccache
|
||||
evict-old-files: 5d
|
||||
|
||||
- name: Install Cuda Toolkit 11.8.0
|
||||
if: ${{ matrix.cuda-toolkit == '11.8.0' }}
|
||||
run: |
|
||||
$CUDA_VERSION = ${{ matrix.cuda-toolkit }}
|
||||
$CUDA_TOOLKIT_DIR = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$CUDA_VERSION"
|
||||
$CUDA_DOWNLOAD = "https://developer.download.nvidia.com/compute/cuda/redist"
|
||||
|
||||
# Components versions
|
||||
$CUDART_VER = "11.8.89"
|
||||
$NVCC_VER = "11.8.89"
|
||||
$NVRTC_VER = "11.8.89"
|
||||
$CUBLAS_VER = "11.8.1.74"
|
||||
$NVTX_VER = "11.8.86"
|
||||
$VS_VER = "11.8.86"
|
||||
$NVPROF_VER = "11.8.87"
|
||||
$CCCL_VER = "11.8.89"
|
||||
|
||||
# Create the directory where the CUDA Toolkit will be installed
|
||||
mkdir -p $CUDA_TOOLKIT_DIR
|
||||
|
||||
# Install unzip to extract the downloaded files
|
||||
choco install unzip -y
|
||||
|
||||
# Download all the required components
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-${CUDART_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-${NVCC_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-${NVRTC_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/libcublas/windows-x86_64/libcublas-windows-x86_64-${CUBLAS_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-${NVTX_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-${VS_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-${NVPROF_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-${CCCL_VER}-archive.zip"
|
||||
|
||||
# Extract all the downloaded files to the CUDA Toolkit directory
|
||||
unzip '*.zip' -d $CUDA_TOOLKIT_DIR
|
||||
|
||||
# Copy all the extracted files to the main CUDA Toolkit directory
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_cudart-windows-x86_64-${CUDART_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_nvcc-windows-x86_64-${NVCC_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_nvrtc-windows-x86_64-${NVRTC_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\libcublas-windows-x86_64-${CUBLAS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_nvtx-windows-x86_64-${NVTX_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_nvprof-windows-x86_64-${NVPROF_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_cccl-windows-x86_64-${CCCL_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
|
||||
# Visual Studio integration
|
||||
xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\v160\BuildCustomizations" /E /I /H /Y
|
||||
|
||||
# Set environment variables
|
||||
echo "$CUDA_TOOLKIT_DIR\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "$CUDA_TOOLKIT_DIR\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V11_8=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Install Cuda Toolkit 12.2.0
|
||||
if: ${{ matrix.cuda-toolkit == '12.2.0' }}
|
||||
run: |
|
||||
$CUDA_VERSION = ${{ matrix.cuda-toolkit }}
|
||||
$CUDA_TOOLKIT_DIR = "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$CUDA_VERSION"
|
||||
$CUDA_DOWNLOAD = "https://developer.download.nvidia.com/compute/cuda/redist"
|
||||
|
||||
# Components versions
|
||||
$CUDART_VER = "12.2.140"
|
||||
$NVCC_VER = "12.2.140"
|
||||
$NVRTC_VER = "12.2.140"
|
||||
$CUBLAS_VER = "12.2.5.6"
|
||||
$NVTX_VER = "12.2.140"
|
||||
$PROFILER_VER = "12.2.140"
|
||||
$VS_VER = "12.2.140"
|
||||
$NVPROF_VER = "12.2.142"
|
||||
$CCCL_VER = "12.2.140"
|
||||
|
||||
# Create the directory where the CUDA Toolkit will be installed
|
||||
mkdir -p $CUDA_TOOLKIT_DIR
|
||||
|
||||
# Install unzip to extract the downloaded files
|
||||
choco install unzip -y
|
||||
|
||||
# Download all the required components
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_cudart/windows-x86_64/cuda_cudart-windows-x86_64-${CUDART_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_nvcc/windows-x86_64/cuda_nvcc-windows-x86_64-${NVCC_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_nvrtc/windows-x86_64/cuda_nvrtc-windows-x86_64-${NVRTC_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/libcublas/windows-x86_64/libcublas-windows-x86_64-${CUBLAS_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_nvtx/windows-x86_64/cuda_nvtx-windows-x86_64-${NVTX_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_profiler_api/windows-x86_64/cuda_profiler_api-windows-x86_64-${PROFILER_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/visual_studio_integration/windows-x86_64/visual_studio_integration-windows-x86_64-${VS_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_nvprof/windows-x86_64/cuda_nvprof-windows-x86_64-${NVPROF_VER}-archive.zip"
|
||||
curl -O "$CUDA_DOWNLOAD/cuda_cccl/windows-x86_64/cuda_cccl-windows-x86_64-${CCCL_VER}-archive.zip"
|
||||
|
||||
# Extract all the downloaded files to the CUDA Toolkit directory
|
||||
unzip -q '*.zip' -d $CUDA_TOOLKIT_DIR
|
||||
|
||||
# Copy all the extracted files to the main CUDA Toolkit directory
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_cudart-windows-x86_64-${CUDART_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_nvcc-windows-x86_64-${NVCC_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_nvrtc-windows-x86_64-${NVRTC_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\libcublas-windows-x86_64-${CUBLAS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_nvtx-windows-x86_64-${NVTX_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_nvprof-windows-x86_64-${NVPROF_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_cccl-windows-x86_64-${CCCL_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\cuda_profiler_api-windows-x86_64-${PROFILER_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\*" "$CUDA_TOOLKIT_DIR" /E /I /H /Y
|
||||
|
||||
# Visual Studio integration
|
||||
xcopy "$CUDA_TOOLKIT_DIR\visual_studio_integration-windows-x86_64-${VS_VER}-archive\visual_studio_integration\MSBuildExtensions\*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\MSBuild\Microsoft\VC\v160\BuildCustomizations" /E /I /H /Y
|
||||
|
||||
# Set environment variables
|
||||
echo "$CUDA_TOOLKIT_DIR\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "$CUDA_TOOLKIT_DIR\libnvvp" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append
|
||||
echo "CUDA_PATH=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
echo "CUDA_PATH_V12_2=$CUDA_TOOLKIT_DIR" | Out-File -FilePath $env:GITHUB_ENV -Append -Encoding utf8
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v2
|
||||
|
||||
- name: Install CUDA Toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.15
|
||||
with:
|
||||
cuda: '${{ matrix.cuda-toolkit }}'
|
||||
|
||||
- name: Install 7-Zip
|
||||
run: choco install 7zip -y
|
||||
|
||||
@ -818,30 +610,25 @@ jobs:
|
||||
echo "SDL2_DIR=${{ github.workspace }}\SDL2-${{ matrix.sdl2_ver }}\cmake" | Out-File -FilePath $env:GITHUB_ENV -Append
|
||||
echo "${{ github.workspace }}\SDL2-${{ matrix.sdl2_ver }}\cmake" > SDL2_PATH.txt
|
||||
|
||||
- name: Install cmake
|
||||
run: choco install cmake
|
||||
- name: Configure CMake
|
||||
shell: cmd
|
||||
run: |
|
||||
cmake -S . -B ./build -A ${{ matrix.arch }} ^
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build }} ^
|
||||
-DGGML_CUDA=${{ matrix.cublas }} ^
|
||||
-DCMAKE_CUDA_ARCHITECTURES=all ^
|
||||
-DWHISPER_SDL2=${{ matrix.sdl2 }} ^
|
||||
-DSDL2_DIR="%SDL2_DIR%"
|
||||
|
||||
- name: Build Project
|
||||
shell: cmd
|
||||
run: |
|
||||
call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
|
||||
cmake --version
|
||||
where cmake
|
||||
cmake -S . -B build -G "Ninja Multi-Config" ^
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build }} ^
|
||||
-DGGML_CUDA=${{ matrix.cublas }} ^
|
||||
-DWHISPER_SDL2=${{ matrix.sdl2 }} ^
|
||||
-DSDL2_DIR="%SDL2_DIR%"
|
||||
set /A NINJA_JOBS=%NUMBER_OF_PROCESSORS%-1
|
||||
cmake --build build --config ${{ matrix.build }} -j %NUMBER_OF_PROCESSORS%
|
||||
|
||||
- name: Check sccache status after build
|
||||
run: |
|
||||
sccache --show-stats
|
||||
cd ./build
|
||||
cmake --build . --config ${{ matrix.build }}
|
||||
|
||||
- name: Copy CUDA DLLs
|
||||
run: |
|
||||
Get-ChildItem "$env:CUDA_PATH\bin\" -Filter "*.dll" |
|
||||
Get-ChildItem "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/" -Filter "*.dll" |
|
||||
Copy-Item -Destination "build/bin/${{ matrix.build }}"
|
||||
|
||||
- name: Copy SDL2.dll
|
||||
@ -855,8 +642,6 @@ jobs:
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
emscripten:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
strategy:
|
||||
@ -880,7 +665,6 @@ jobs:
|
||||
|
||||
ios-xcode-build:
|
||||
runs-on: macos-latest
|
||||
needs: determine-tag
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
@ -923,26 +707,7 @@ jobs:
|
||||
- name: Build swiftui example
|
||||
run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphoneos CODE_SIGNING_REQUIRED=NO CODE_SIGN_IDENTITY= -destination 'generic/platform=iOS' FRAMEWORK_FOLDER_PATH=./build-ios build
|
||||
|
||||
- name: Pack artifacts
|
||||
id: pack_artifacts
|
||||
if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
|
||||
github.event.inputs.create_release == 'true' ||
|
||||
github.event.inputs.pre_release_tag != '' }}
|
||||
run: |
|
||||
zip --symlinks -r whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip build-apple/whisper.xcframework
|
||||
|
||||
- name: Upload artifacts
|
||||
if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
|
||||
github.event.inputs.create_release == 'true' ||
|
||||
github.event.inputs.pre_release_tag != '' }}
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
path: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework.zip
|
||||
name: whisper-${{ needs.determine-tag.outputs.tag_name }}-xcframework
|
||||
|
||||
android:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
@ -971,30 +736,31 @@ jobs:
|
||||
cd whisper/examples/whisper.android
|
||||
./gradlew assembleRelease --no-daemon
|
||||
|
||||
android_java:
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: set up JDK 11
|
||||
uses: actions/setup-java@v4
|
||||
with:
|
||||
java-version: '11'
|
||||
distribution: 'temurin'
|
||||
cache: gradle
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@v3
|
||||
with:
|
||||
cmdline-tools-version: 9.0
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd examples/whisper.android.java
|
||||
chmod +x ./gradlew
|
||||
./gradlew assembleRelease
|
||||
# TODO: disabled because of the following failure: https://github.com/ggerganov/whisper.cpp/actions/runs/11019444420/job/30627193602
|
||||
# android_java:
|
||||
# runs-on: ubuntu-22.04
|
||||
#
|
||||
# steps:
|
||||
# - name: Clone
|
||||
# uses: actions/checkout@v4
|
||||
#
|
||||
# - name: set up JDK 11
|
||||
# uses: actions/setup-java@v4
|
||||
# with:
|
||||
# java-version: '11'
|
||||
# distribution: 'temurin'
|
||||
# cache: gradle
|
||||
#
|
||||
# - name: Setup Android SDK
|
||||
# uses: android-actions/setup-android@v3
|
||||
# with:
|
||||
# cmdline-tools-version: 9.0
|
||||
#
|
||||
# - name: Build
|
||||
# run: |
|
||||
# cd examples/whisper.android.java
|
||||
# chmod +x ./gradlew
|
||||
# ./gradlew assembleRelease
|
||||
|
||||
# TODO: disabled because of following fail: https://github.com/ggerganov/whisper.cpp/actions/runs/9686220096/job/26735899598
|
||||
# java:
|
||||
@ -1041,8 +807,6 @@ jobs:
|
||||
# PGP_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }}
|
||||
|
||||
quantize:
|
||||
if: ${{ github.event_name == 'push' || github.event_name == 'pull_request' ||
|
||||
github.event.inputs.run_type == 'full-ci' }}
|
||||
runs-on: ubuntu-22.04
|
||||
|
||||
steps:
|
||||
@ -1055,95 +819,3 @@ jobs:
|
||||
cmake -B build
|
||||
cmake --build build --config Release
|
||||
./build/bin/quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0
|
||||
|
||||
release:
|
||||
if: ${{ github.event.inputs.create_release == 'true' || github.event.inputs.pre_release_tag != '' }}
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
needs:
|
||||
- determine-tag
|
||||
- ios-xcode-build
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
id: checkout
|
||||
uses: actions/checkout@v4
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: ccache
|
||||
uses: hendrikmuhs/ccache-action@v1.2.16
|
||||
with:
|
||||
key: release
|
||||
evict-old-files: 1d
|
||||
|
||||
# Downloads all the artifacts from the previous jobs
|
||||
- name: Download artifacts
|
||||
id: download-artifact
|
||||
uses: actions/download-artifact@v4
|
||||
with:
|
||||
path: ./artifact
|
||||
|
||||
- name: Move artifacts
|
||||
id: move_artifacts
|
||||
run: mkdir -p ./artifact/release && mv ./artifact/*/*.zip ./artifact/release
|
||||
|
||||
- name: Create release
|
||||
id: create_release
|
||||
uses: ggml-org/action-create-release@v1
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
with:
|
||||
tag_name: ${{ needs.determine-tag.outputs.tag_name }}
|
||||
prerelease: ${{ github.event.inputs.pre_release_tag != '' }}
|
||||
|
||||
- name: Upload release
|
||||
id: upload_release
|
||||
uses: actions/github-script@v3
|
||||
with:
|
||||
github-token: ${{secrets.GITHUB_TOKEN}}
|
||||
script: |
|
||||
const path = require('path');
|
||||
const fs = require('fs');
|
||||
const release_id = '${{ steps.create_release.outputs.id }}';
|
||||
for (let file of await fs.readdirSync('./artifact/release')) {
|
||||
if (path.extname(file) === '.zip') {
|
||||
console.log('uploadReleaseAsset', file);
|
||||
await github.repos.uploadReleaseAsset({
|
||||
owner: context.repo.owner,
|
||||
repo: context.repo.repo,
|
||||
release_id: release_id,
|
||||
name: file,
|
||||
data: await fs.readFileSync(`./artifact/release/${file}`)
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
coreml-base-en:
|
||||
if: ${{ (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
|
||||
github.event.inputs.create_release == 'true' ||
|
||||
github.event.inputs.pre_release_tag != '' }}
|
||||
runs-on: macos-latest
|
||||
needs: determine-tag
|
||||
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set environment variables
|
||||
id: set_vars
|
||||
run: |
|
||||
echo "MODEL_NAME=base.en" >> $GITHUB_ENV
|
||||
echo "GEN_MODEL_NAME=whisper-${{ needs.determine-tag.outputs.tag_name }}-ggml-base.en-encoder.mlmodelc" >> $GITHUB_ENV
|
||||
|
||||
- name: Download model
|
||||
run: |
|
||||
./models/download-ggml-model.sh ${{ env.MODEL_NAME }}
|
||||
|
||||
- name: Generate CoreML model
|
||||
run: |
|
||||
python3.11 -m venv venv
|
||||
source venv/bin/activate
|
||||
pip install ane_transformers openai-whisper coremltools
|
||||
./models/generate-coreml-model.sh ${{ env.MODEL_NAME }}
|
||||
|
91
.github/workflows/examples-wasm.yml
vendored
91
.github/workflows/examples-wasm.yml
vendored
@ -1,91 +0,0 @@
|
||||
name: Examples WASM
|
||||
on:
|
||||
push:
|
||||
branches: ["master"]
|
||||
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
concurrency:
|
||||
group: "pages"
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
deploy-wasm-github-pages:
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Setup Pages
|
||||
uses: actions/configure-pages@v4
|
||||
|
||||
- name: Setup emsdk
|
||||
uses: mymindstorm/setup-emsdk@v14
|
||||
|
||||
- name: Build WASM Examples
|
||||
# Enable for real build later in whisper.cpp
|
||||
run: |
|
||||
mkdir -p build-em && cd build-em
|
||||
emcmake cmake .. -DCMAKE_BUILD_TYPE=Release
|
||||
make -j
|
||||
|
||||
- name: Create staging directory
|
||||
run: mkdir -p staging
|
||||
|
||||
- name: Create .nojekyll file in staging directory
|
||||
run: touch staging/.nojekyll
|
||||
|
||||
- name: Copy application files
|
||||
run: |
|
||||
build_dir=build-em/bin
|
||||
|
||||
ls ${build_dir}
|
||||
|
||||
# command.wasm
|
||||
target_dir=staging/command.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/command.wasm/{index.html,command.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libcommand.js ${target_dir}
|
||||
|
||||
# bench.wasm
|
||||
target_dir=staging/bench.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/bench.wasm/{index.html,bench.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libbench.js ${target_dir}
|
||||
|
||||
# stream.wasm
|
||||
target_dir=staging/stream.wasm
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/stream.wasm/{index.html,stream.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libstream.js ${target_dir}
|
||||
|
||||
# whisper.wasm (this will be the main example page)
|
||||
target_dir=staging
|
||||
mkdir -p ${target_dir}
|
||||
cp ${build_dir}/whisper.wasm/{index.html,main.js,helpers.js} ${target_dir}
|
||||
cp ${build_dir}/libmain.js ${target_dir}
|
||||
|
||||
# Copy Cross-Origin Isolation service worker
|
||||
cp -v examples/coi-serviceworker.js staging/
|
||||
|
||||
- name: List files in staging directory (for debugging)
|
||||
run: |
|
||||
echo "Files in staging directory:"
|
||||
find staging -type f | sort
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-pages-artifact@v3
|
||||
with:
|
||||
path: ./staging
|
||||
|
||||
- name: Deploy to GitHub Pages
|
||||
id: deployment
|
||||
uses: actions/deploy-pages@v4
|
@ -1,6 +1,6 @@
|
||||
cmake_minimum_required(VERSION 3.5) # for add_link_options and implicit target directories.
|
||||
project("whisper.cpp" C CXX)
|
||||
project("whisper.cpp" VERSION 1.7.5)
|
||||
project("whisper.cpp" VERSION 1.7.4)
|
||||
include(CheckIncludeFileCXX)
|
||||
|
||||
set(SOVERSION 1)
|
||||
@ -38,13 +38,8 @@ if (EMSCRIPTEN)
|
||||
|
||||
# TODO: without these, we get the following error:
|
||||
# wasm-ld: error: --shared-memory is disallowed by whisper.cpp.o because it was not compiled with 'atomics' or 'bulk-memory' features.
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
|
||||
|
||||
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -s TOTAL_STACK=5242880")
|
||||
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s TOTAL_STACK=5242880")
|
||||
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-deprecated")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread -s TOTAL_STACK=5242880")
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread -s TOTAL_STACK=5242880")
|
||||
else()
|
||||
if (MINGW)
|
||||
set(BUILD_SHARED_LIBS_DEFAULT OFF)
|
||||
@ -67,8 +62,7 @@ option(WHISPER_ALL_WARNINGS "whisper: enable all compiler warnings"
|
||||
option(WHISPER_ALL_WARNINGS_3RD_PARTY "whisper: enable all compiler warnings in 3rd party libs" OFF)
|
||||
|
||||
# build
|
||||
option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
|
||||
option(WHISPER_USE_SYSTEM_GGML "whisper: use system-installed GGML library" OFF)
|
||||
option(WHISPER_FATAL_WARNINGS "whisper: enable -Werror flag" OFF)
|
||||
|
||||
# sanitizers
|
||||
option(WHISPER_SANITIZE_THREAD "whisper: enable thread sanitizer" OFF)
|
||||
@ -127,15 +121,7 @@ whisper_option_depr(WARNING WHISPER_SYCL_F16 GGML_SYCL_F16)
|
||||
#
|
||||
|
||||
if (NOT TARGET ggml)
|
||||
if (WHISPER_USE_SYSTEM_GGML)
|
||||
find_package(ggml REQUIRED)
|
||||
if (NOT ggml_FOUND)
|
||||
message(FATAL_ERROR "System-installed GGML library not found.")
|
||||
endif()
|
||||
add_library(ggml ALIAS ggml::ggml)
|
||||
else()
|
||||
add_subdirectory(ggml)
|
||||
endif()
|
||||
add_subdirectory(ggml)
|
||||
# ... otherwise assume ggml is added by a parent CMakeLists.txt
|
||||
endif()
|
||||
add_subdirectory(src)
|
||||
@ -190,8 +176,8 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/whisper.pc"
|
||||
#
|
||||
|
||||
if (WHISPER_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
|
||||
include(CTest)
|
||||
add_subdirectory(tests)
|
||||
#include(CTest)
|
||||
#add_subdirectory(tests)
|
||||
endif ()
|
||||
|
||||
if (WHISPER_BUILD_EXAMPLES)
|
||||
|
99
README.md
99
README.md
@ -2,12 +2,15 @@
|
||||
|
||||

|
||||
|
||||
[](https://github.com/ggml-org/whisper.cpp/actions)
|
||||
[](https://github.com/ggerganov/whisper.cpp/actions)
|
||||
[](https://opensource.org/licenses/MIT)
|
||||
[](https://conan.io/center/whisper-cpp)
|
||||
[](https://www.npmjs.com/package/whisper.cpp/)
|
||||
|
||||
Stable: [v1.7.5](https://github.com/ggml-org/whisper.cpp/releases/tag/v1.7.5) / [Roadmap](https://github.com/orgs/ggml-org/projects/4/)
|
||||
> [!NOTE]
|
||||
> New maintenance roadmap: https://github.com/ggerganov/whisper.cpp/discussions/2788
|
||||
|
||||
Stable: [v1.7.4](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.7.4) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
|
||||
|
||||
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
|
||||
|
||||
@ -23,7 +26,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
|
||||
- [Efficient GPU support for NVIDIA](#nvidia-gpu-support)
|
||||
- [OpenVINO Support](#openvino-support)
|
||||
- [Ascend NPU Support](#ascend-npu-support)
|
||||
- [C-style API](https://github.com/ggml-org/whisper.cpp/blob/master/include/whisper.h)
|
||||
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/include/whisper.h)
|
||||
|
||||
Supported platforms:
|
||||
|
||||
@ -31,14 +34,14 @@ Supported platforms:
|
||||
- [x] [iOS](examples/whisper.objc)
|
||||
- [x] [Android](examples/whisper.android)
|
||||
- [x] [Java](bindings/java/README.md)
|
||||
- [x] Linux / [FreeBSD](https://github.com/ggml-org/whisper.cpp/issues/56#issuecomment-1350920264)
|
||||
- [x] Linux / [FreeBSD](https://github.com/ggerganov/whisper.cpp/issues/56#issuecomment-1350920264)
|
||||
- [x] [WebAssembly](examples/whisper.wasm)
|
||||
- [x] Windows ([MSVC](https://github.com/ggml-org/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggml-org/whisper.cpp/issues/168))
|
||||
- [x] [Raspberry Pi](https://github.com/ggml-org/whisper.cpp/discussions/166)
|
||||
- [x] [Docker](https://github.com/ggml-org/whisper.cpp/pkgs/container/whisper.cpp)
|
||||
- [x] Windows ([MSVC](https://github.com/ggerganov/whisper.cpp/blob/master/.github/workflows/build.yml#L117-L144) and [MinGW](https://github.com/ggerganov/whisper.cpp/issues/168))
|
||||
- [x] [Raspberry Pi](https://github.com/ggerganov/whisper.cpp/discussions/166)
|
||||
- [x] [Docker](https://github.com/ggerganov/whisper.cpp/pkgs/container/whisper.cpp)
|
||||
|
||||
The entire high-level implementation of the model is contained in [whisper.h](include/whisper.h) and [whisper.cpp](src/whisper.cpp).
|
||||
The rest of the code is part of the [`ggml`](https://github.com/ggml-org/ggml) machine learning library.
|
||||
The rest of the code is part of the [`ggml`](https://github.com/ggerganov/ggml) machine learning library.
|
||||
|
||||
Having such a lightweight implementation of the model makes it easy to integrate it into different platforms and applications.
|
||||
As an example, here is a video of running the model on an iPhone 13 device - fully offline, on-device: [whisper.objc](examples/whisper.objc)
|
||||
@ -51,14 +54,14 @@ https://user-images.githubusercontent.com/1991296/204038393-2f846eae-c255-4099-a
|
||||
|
||||
On Apple Silicon, the inference runs fully on the GPU via Metal:
|
||||
|
||||
https://github.com/ggml-org/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
|
||||
https://github.com/ggerganov/whisper.cpp/assets/1991296/c82e8f86-60dc-49f2-b048-d2fdbd6b5225
|
||||
|
||||
## Quick start
|
||||
|
||||
First clone the repository:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
```
|
||||
|
||||
Navigate into the directory:
|
||||
@ -149,7 +152,6 @@ standard cmake setup with:
|
||||
cmake -B build -DGGML_BLAS=1
|
||||
cmake --build build --config Release
|
||||
./build/bin/whisper-cli [ .. etc .. ]
|
||||
```
|
||||
|
||||
## Quantization
|
||||
|
||||
@ -182,11 +184,11 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
|
||||
```
|
||||
|
||||
- To ensure `coremltools` operates correctly, please confirm that [Xcode](https://developer.apple.com/xcode/) is installed and execute `xcode-select --install` to install the command-line tools.
|
||||
- Python 3.11 is recommended.
|
||||
- Python 3.10 is recommended.
|
||||
- MacOS Sonoma (version 14) or newer is recommended, as older versions of MacOS might experience issues with transcription hallucination.
|
||||
- [OPTIONAL] It is recommended to utilize a Python version management system, such as [Miniconda](https://docs.conda.io/en/latest/miniconda.html) for this step:
|
||||
- To create an environment, use: `conda create -n py311-whisper python=3.11 -y`
|
||||
- To activate the environment, use: `conda activate py311-whisper`
|
||||
- To create an environment, use: `conda create -n py310-whisper python=3.10 -y`
|
||||
- To activate the environment, use: `conda activate py310-whisper`
|
||||
|
||||
- Generate a Core ML model. For example, to generate a `base.en` model, use:
|
||||
|
||||
@ -223,7 +225,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
|
||||
The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
|
||||
Next runs are faster.
|
||||
|
||||
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggml-org/whisper.cpp/pull/566).
|
||||
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
|
||||
|
||||
## OpenVINO support
|
||||
|
||||
@ -308,7 +310,7 @@ This can result in significant speedup in encoder performance. Here are the inst
|
||||
The first run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
|
||||
cached for the next run.
|
||||
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggml-org/whisper.cpp/pull/1037).
|
||||
For more information about the OpenVINO implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
|
||||
|
||||
## NVIDIA GPU support
|
||||
|
||||
@ -386,8 +388,8 @@ Run the inference examples as usual, for example:
|
||||
|
||||
We have two Docker images available for this project:
|
||||
|
||||
1. `ghcr.io/ggml-org/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggml-org/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
1. `ghcr.io/ggerganov/whisper.cpp:main`: This image includes the main executable file as well as `curl` and `ffmpeg`. (platforms: `linux/amd64`, `linux/arm64`)
|
||||
2. `ghcr.io/ggerganov/whisper.cpp:main-cuda`: Same as `main` but compiled with CUDA support. (platforms: `linux/amd64`)
|
||||
|
||||
### Usage
|
||||
|
||||
@ -425,8 +427,7 @@ For detailed instructions on how to use Conan, please refer to the [Conan docume
|
||||
|
||||
This is a naive example of performing real-time inference on audio from your microphone.
|
||||
The [stream](examples/stream) tool samples the audio every half a second and runs the transcription continuously.
|
||||
More info is available in [issue #10](https://github.com/ggml-org/whisper.cpp/issues/10).
|
||||
You will need to have [sdl2](https://wiki.libsdl.org/SDL2/Installation) installed for it to work properly.
|
||||
More info is available in [issue #10](https://github.com/ggerganov/whisper.cpp/issues/10).
|
||||
|
||||
```bash
|
||||
cmake -B build -DWHISPER_SDL2=ON
|
||||
@ -514,7 +515,7 @@ main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 pr
|
||||
|
||||
## Speaker segmentation via tinydiarize (experimental)
|
||||
|
||||
More information about this approach is available here: https://github.com/ggml-org/whisper.cpp/pull/1058
|
||||
More information about this approach is available here: https://github.com/ggerganov/whisper.cpp/pull/1058
|
||||
|
||||
Sample usage:
|
||||
|
||||
@ -578,7 +579,7 @@ https://user-images.githubusercontent.com/1991296/199337538-b7b0c7a3-2753-4a88-a
|
||||
|
||||
## Video comparison of different models
|
||||
|
||||
Use the [scripts/bench-wts.sh](https://github.com/ggml-org/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
|
||||
Use the [scripts/bench-wts.sh](https://github.com/ggerganov/whisper.cpp/blob/master/scripts/bench-wts.sh) script to generate a video in the following format:
|
||||
|
||||
```bash
|
||||
./scripts/bench-wts.sh samples/jfk.wav
|
||||
@ -595,7 +596,7 @@ In order to have an objective comparison of the performance of the inference acr
|
||||
use the [whisper-bench](examples/bench) tool. The tool simply runs the Encoder part of the model and prints how much time it
|
||||
took to execute it. The results are summarized in the following Github issue:
|
||||
|
||||
[Benchmark results](https://github.com/ggml-org/whisper.cpp/issues/89)
|
||||
[Benchmark results](https://github.com/ggerganov/whisper.cpp/issues/89)
|
||||
|
||||
Additionally, a script to run whisper.cpp with different models and audio files is provided: [bench.py](scripts/bench.py).
|
||||
|
||||
@ -622,24 +623,25 @@ You can download the converted models using the [models/download-ggml-model.sh](
|
||||
or manually from here:
|
||||
|
||||
- https://huggingface.co/ggerganov/whisper.cpp
|
||||
- https://ggml.ggerganov.com
|
||||
|
||||
For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or [models/README.md](models/README.md).
|
||||
|
||||
## [Bindings](https://github.com/ggml-org/whisper.cpp/discussions/categories/bindings)
|
||||
## [Bindings](https://github.com/ggerganov/whisper.cpp/discussions/categories/bindings)
|
||||
|
||||
- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggml-org/whisper.cpp/discussions/310)
|
||||
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggml-org/whisper.cpp/discussions/309)
|
||||
- [x] Rust: [tazz4843/whisper-rs](https://github.com/tazz4843/whisper-rs) | [#310](https://github.com/ggerganov/whisper.cpp/discussions/310)
|
||||
- [x] JavaScript: [bindings/javascript](bindings/javascript) | [#309](https://github.com/ggerganov/whisper.cpp/discussions/309)
|
||||
- React Native (iOS / Android): [whisper.rn](https://github.com/mybigday/whisper.rn)
|
||||
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggml-org/whisper.cpp/discussions/312)
|
||||
- [x] Go: [bindings/go](bindings/go) | [#312](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
- [x] Java:
|
||||
- [GiviMAD/whisper-jni](https://github.com/GiviMAD/whisper-jni)
|
||||
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggml-org/whisper.cpp/discussions/507)
|
||||
- [x] Objective-C / Swift: [ggml-org/whisper.spm](https://github.com/ggml-org/whisper.spm) | [#313](https://github.com/ggml-org/whisper.cpp/discussions/313)
|
||||
- [x] Ruby: [bindings/ruby](bindings/ruby) | [#507](https://github.com/ggerganov/whisper.cpp/discussions/507)
|
||||
- [x] Objective-C / Swift: [ggerganov/whisper.spm](https://github.com/ggerganov/whisper.spm) | [#313](https://github.com/ggerganov/whisper.cpp/discussions/313)
|
||||
- [exPHAT/SwiftWhisper](https://github.com/exPHAT/SwiftWhisper)
|
||||
- [x] .NET: | [#422](https://github.com/ggml-org/whisper.cpp/discussions/422)
|
||||
- [x] .NET: | [#422](https://github.com/ggerganov/whisper.cpp/discussions/422)
|
||||
- [sandrohanea/whisper.net](https://github.com/sandrohanea/whisper.net)
|
||||
- [NickDarvey/whisper](https://github.com/NickDarvey/whisper)
|
||||
- [x] Python: | [#9](https://github.com/ggml-org/whisper.cpp/issues/9)
|
||||
- [x] Python: | [#9](https://github.com/ggerganov/whisper.cpp/issues/9)
|
||||
- [stlukey/whispercpp.py](https://github.com/stlukey/whispercpp.py) (Cython)
|
||||
- [AIWintermuteAI/whispercpp](https://github.com/AIWintermuteAI/whispercpp) (Updated fork of aarnphm/whispercpp)
|
||||
- [aarnphm/whispercpp](https://github.com/aarnphm/whispercpp) (Pybind11)
|
||||
@ -647,33 +649,6 @@ For more details, see the conversion script [models/convert-pt-to-ggml.py](model
|
||||
- [x] R: [bnosac/audio.whisper](https://github.com/bnosac/audio.whisper)
|
||||
- [x] Unity: [macoron/whisper.unity](https://github.com/Macoron/whisper.unity)
|
||||
|
||||
## XCFramework
|
||||
The XCFramework is a precompiled version of the library for iOS, visionOS, tvOS,
|
||||
and macOS. It can be used in Swift projects without the need to compile the
|
||||
library from source. For example:
|
||||
```swift
|
||||
// swift-tools-version: 5.10
|
||||
// The swift-tools-version declares the minimum version of Swift required to build this package.
|
||||
|
||||
import PackageDescription
|
||||
|
||||
let package = Package(
|
||||
name: "Whisper",
|
||||
targets: [
|
||||
.executableTarget(
|
||||
name: "Whisper",
|
||||
dependencies: [
|
||||
"WhisperFramework"
|
||||
]),
|
||||
.binaryTarget(
|
||||
name: "WhisperFramework",
|
||||
url: "https://github.com/ggml-org/whisper.cpp/releases/download/v1.7.5/whisper-v1.7.5-xcframework.zip",
|
||||
checksum: "c7faeb328620d6012e130f3d705c51a6ea6c995605f2df50f6e1ad68c59c6c4a"
|
||||
)
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
## Examples
|
||||
|
||||
There are various examples of using the library for different projects in the [examples](examples) folder.
|
||||
@ -692,13 +667,13 @@ Some of the examples are even ported to run in the browser using WebAssembly. Ch
|
||||
| [whisper.android](examples/whisper.android) | | Android mobile application using whisper.cpp |
|
||||
| [whisper.nvim](examples/whisper.nvim) | | Speech-to-text plugin for Neovim |
|
||||
| [generate-karaoke.sh](examples/generate-karaoke.sh) | | Helper script to easily [generate a karaoke video](https://youtu.be/uj7hVta4blM) of raw audio capture |
|
||||
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggml-org/whisper.cpp/issues/185) |
|
||||
| [livestream.sh](examples/livestream.sh) | | [Livestream audio transcription](https://github.com/ggerganov/whisper.cpp/issues/185) |
|
||||
| [yt-wsp.sh](examples/yt-wsp.sh) | | Download + transcribe and/or translate any VOD [(original)](https://gist.github.com/DaniruKun/96f763ec1a037cc92fe1a059b643b818) |
|
||||
| [wchess](examples/wchess) | [wchess.wasm](examples/wchess) | Voice-controlled chess |
|
||||
|
||||
## [Discussions](https://github.com/ggml-org/whisper.cpp/discussions)
|
||||
## [Discussions](https://github.com/ggerganov/whisper.cpp/discussions)
|
||||
|
||||
If you have any kind of feedback about this project feel free to use the Discussions section and open a new topic.
|
||||
You can use the [Show and tell](https://github.com/ggml-org/whisper.cpp/discussions/categories/show-and-tell) category
|
||||
You can use the [Show and tell](https://github.com/ggerganov/whisper.cpp/discussions/categories/show-and-tell) category
|
||||
to share your own projects that use `whisper.cpp`. If you have a question, make sure to check the
|
||||
[Frequently asked questions (#126)](https://github.com/ggml-org/whisper.cpp/discussions/126) discussion.
|
||||
[Frequently asked questions (#126)](https://github.com/ggerganov/whisper.cpp/discussions/126) discussion.
|
||||
|
@ -11,11 +11,11 @@ UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
GGML_METAL_PATH_RESOURCES := $(abspath ../..)
|
||||
BUILD_DIR := build_go
|
||||
BUILD_DIR := build
|
||||
MODELS_DIR := models
|
||||
EXAMPLES_DIR := $(wildcard examples/*)
|
||||
INCLUDE_PATH := $(abspath ../../include):$(abspath ../../ggml/include)
|
||||
LIBRARY_PATH := $(abspath ../../${BUILD_DIR}/src:$(abspath ../../${BUILD_DIR}/ggml/src))
|
||||
LIBRARY_PATH := $(abspath ../..)
|
||||
|
||||
ifeq ($(GGML_CUDA),1)
|
||||
LIBRARY_PATH := $(LIBRARY_PATH):$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib/
|
||||
@ -29,10 +29,8 @@ endif
|
||||
all: clean whisper examples
|
||||
|
||||
whisper: mkdir
|
||||
cmake -S ../.. -B ../../${BUILD_DIR} \
|
||||
-DCMAKE_BUILD_TYPE=Release \
|
||||
-DBUILD_SHARED_LIBS=OFF
|
||||
cmake --build ../../${BUILD_DIR} --target whisper
|
||||
@echo Build whisper
|
||||
@${MAKE} -C ../.. libwhisper.a
|
||||
|
||||
test: model-small whisper modtidy
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
|
@ -31,7 +31,7 @@ func main() {
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := context.Process(samples, nil, nil, nil); err != nil {
|
||||
if err := context.Process(samples, nil, nil); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@ -51,7 +51,7 @@ func main() {
|
||||
In order to build, you need to have the Go compiler installed. You can get it from [here](https://golang.org/dl/). Run the tests with:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp/bindings/go
|
||||
make test
|
||||
```
|
||||
@ -98,7 +98,7 @@ The API Documentation:
|
||||
|
||||
Getting help:
|
||||
|
||||
* Follow the discussion for the go bindings [here](https://github.com/ggml-org/whisper.cpp/discussions/312)
|
||||
* Follow the discussion for the go bindings [here](https://github.com/ggerganov/whisper.cpp/discussions/312)
|
||||
|
||||
## License
|
||||
|
||||
|
@ -1,5 +1,5 @@
|
||||
/*
|
||||
github.com/ggml-org/whisper.cpp/bindings/go
|
||||
github.com/ggerganov/whisper.cpp/bindings/go
|
||||
provides speech-to-text service bindings for the Go programming language.
|
||||
*/
|
||||
package whisper
|
||||
|
@ -67,7 +67,7 @@ func Process(model whisper.Model, path string, flags *Flags) error {
|
||||
// Process the data
|
||||
fmt.Fprintf(flags.Output(), " ...processing %q\n", path)
|
||||
context.ResetTimings()
|
||||
if err := context.Process(data, nil, cb, nil); err != nil {
|
||||
if err := context.Process(data, cb, nil); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -71,10 +71,6 @@ func (context *context) Language() string {
|
||||
return whisper.Whisper_lang_str(context.params.Language())
|
||||
}
|
||||
|
||||
func (context *context) DetectedLanguage() string {
|
||||
return whisper.Whisper_lang_str(context.model.ctx.Whisper_full_lang_id())
|
||||
}
|
||||
|
||||
// Set translate flag
|
||||
func (context *context) SetTranslate(v bool) {
|
||||
context.params.SetTranslate(v)
|
||||
@ -193,7 +189,6 @@ func (context *context) WhisperLangAutoDetect(offset_ms int, n_threads int) ([]f
|
||||
// Process new sample data and return any errors
|
||||
func (context *context) Process(
|
||||
data []float32,
|
||||
callEncoderBegin EncoderBeginCallback,
|
||||
callNewSegment SegmentCallback,
|
||||
callProgress ProgressCallback,
|
||||
) error {
|
||||
@ -208,20 +203,7 @@ func (context *context) Process(
|
||||
// We don't do parallel processing at the moment
|
||||
processors := 0
|
||||
if processors > 1 {
|
||||
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, callEncoderBegin,
|
||||
func(new int) {
|
||||
if callNewSegment != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
for i := s0; i < num_segments; i++ {
|
||||
callNewSegment(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if err := context.model.ctx.Whisper_full(context.params, data, callEncoderBegin,
|
||||
func(new int) {
|
||||
if err := context.model.ctx.Whisper_full_parallel(context.params, data, processors, nil, func(new int) {
|
||||
if callNewSegment != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
@ -229,11 +211,22 @@ func (context *context) Process(
|
||||
callNewSegment(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}, func(progress int) {
|
||||
if callProgress != nil {
|
||||
callProgress(progress)
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if err := context.model.ctx.Whisper_full(context.params, data, nil, func(new int) {
|
||||
if callNewSegment != nil {
|
||||
num_segments := context.model.ctx.Whisper_full_n_segments()
|
||||
s0 := num_segments - new
|
||||
for i := s0; i < num_segments; i++ {
|
||||
callNewSegment(toSegment(context.model.ctx, i))
|
||||
}
|
||||
}
|
||||
}, func(progress int) {
|
||||
if callProgress != nil {
|
||||
callProgress(progress)
|
||||
}
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -88,37 +88,6 @@ func TestProcess(t *testing.T) {
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
|
||||
err = context.Process(data, nil, nil, nil)
|
||||
err = context.Process(data, nil, nil)
|
||||
assert.NoError(err)
|
||||
}
|
||||
|
||||
func TestDetectedLanguage(t *testing.T) {
|
||||
assert := assert.New(t)
|
||||
|
||||
fh, err := os.Open(SamplePath)
|
||||
assert.NoError(err)
|
||||
defer fh.Close()
|
||||
|
||||
// Decode the WAV file - load the full buffer
|
||||
dec := wav.NewDecoder(fh)
|
||||
buf, err := dec.FullPCMBuffer()
|
||||
assert.NoError(err)
|
||||
assert.Equal(uint16(1), dec.NumChans)
|
||||
|
||||
data := buf.AsFloat32Buffer().Data
|
||||
|
||||
model, err := whisper.New(ModelPath)
|
||||
assert.NoError(err)
|
||||
assert.NotNil(model)
|
||||
defer model.Close()
|
||||
|
||||
context, err := model.NewContext()
|
||||
assert.NoError(err)
|
||||
|
||||
err = context.Process(data, nil, nil, nil)
|
||||
assert.NoError(err)
|
||||
|
||||
expectedLanguage := "en"
|
||||
actualLanguage := context.DetectedLanguage()
|
||||
assert.Equal(expectedLanguage, actualLanguage)
|
||||
}
|
||||
|
@ -16,10 +16,6 @@ type SegmentCallback func(Segment)
|
||||
// processing. It is called during the Process function
|
||||
type ProgressCallback func(int)
|
||||
|
||||
// EncoderBeginCallback is the callback function for checking if we want to
|
||||
// continue processing. It is called during the Process function
|
||||
type EncoderBeginCallback func() bool
|
||||
|
||||
// Model is the interface to a whisper model. Create a new model with the
|
||||
// function whisper.New(string)
|
||||
type Model interface {
|
||||
@ -35,13 +31,12 @@ type Model interface {
|
||||
Languages() []string
|
||||
}
|
||||
|
||||
// Context is the speech recognition context.
|
||||
// Context is the speech recognition context.
|
||||
type Context interface {
|
||||
SetLanguage(string) error // Set the language to use for speech recognition, use "auto" for auto detect language.
|
||||
SetTranslate(bool) // Set translate flag
|
||||
IsMultilingual() bool // Return true if the model is multilingual.
|
||||
Language() string // Get language
|
||||
DetectedLanguage() string // Get detected language
|
||||
|
||||
SetOffset(time.Duration) // Set offset
|
||||
SetDuration(time.Duration) // Set duration
|
||||
@ -63,7 +58,7 @@ type Context interface {
|
||||
// Process mono audio data and return any errors.
|
||||
// If defined, newly generated segments are passed to the
|
||||
// callback function during processing.
|
||||
Process([]float32, EncoderBeginCallback, SegmentCallback, ProgressCallback) error
|
||||
Process([]float32, SegmentCallback, ProgressCallback) error
|
||||
|
||||
// After process is called, return segments until the end of the stream
|
||||
// is reached, when io.EOF is returned.
|
||||
|
@ -9,7 +9,7 @@ import (
|
||||
// CGO
|
||||
|
||||
/*
|
||||
#cgo LDFLAGS: -lwhisper -lggml -lggml-base -lggml-cpu -lm -lstdc++ -fopenmp
|
||||
#cgo LDFLAGS: -lwhisper -lm -lstdc++ -fopenmp
|
||||
#cgo darwin LDFLAGS: -framework Accelerate -framework Metal -framework Foundation -framework CoreGraphics
|
||||
#include <whisper.h>
|
||||
#include <stdlib.h>
|
||||
|
@ -31,10 +31,10 @@ public class Example {
|
||||
var whisperParams = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
// custom configuration if required
|
||||
whisperParams.temperature_inc = 0f;
|
||||
|
||||
|
||||
var samples = readAudio(); // divide each value by 32767.0f
|
||||
whisper.fullTranscribe(whisperParams, samples);
|
||||
|
||||
|
||||
int segmentCount = whisper.getTextSegmentCount(context);
|
||||
for (int i = 0; i < segmentCount; i++) {
|
||||
String text = whisper.getTextSegment(context, i);
|
||||
@ -52,7 +52,7 @@ public class Example {
|
||||
In order to build, you need to have JDK 8 or higher installed. Run the tests with:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
cd whisper.cpp/bindings/java
|
||||
|
||||
./gradlew build
|
||||
|
@ -25,13 +25,13 @@ sourceSets {
|
||||
}
|
||||
|
||||
tasks.register('copyLibwhisperDynlib', Copy) {
|
||||
from '../../build/src'
|
||||
include 'libwhisper.dylib'
|
||||
from '../../build'
|
||||
include 'libwhisper.dynlib'
|
||||
into 'build/generated/resources/main/darwin'
|
||||
}
|
||||
|
||||
tasks.register('copyLibwhisperSo', Copy) {
|
||||
from '../../build/src'
|
||||
from '../../build'
|
||||
include 'libwhisper.so'
|
||||
into 'build/generated/resources/main/linux-x86-64'
|
||||
}
|
||||
@ -55,12 +55,7 @@ java {
|
||||
withJavadocJar()
|
||||
}
|
||||
|
||||
sourcesJar() {
|
||||
dependsOn copyLibs
|
||||
}
|
||||
|
||||
jar {
|
||||
dependsOn copyLibs
|
||||
exclude '**/whisper_java.exp', '**/whisper_java.lib'
|
||||
}
|
||||
|
||||
@ -72,9 +67,6 @@ tasks.withType(Test) {
|
||||
useJUnitPlatform()
|
||||
}
|
||||
|
||||
test.dependsOn copyLibs
|
||||
processResources.dependsOn copyLibs
|
||||
|
||||
dependencies {
|
||||
implementation "net.java.dev.jna:jna:5.13.0"
|
||||
testImplementation "org.junit.jupiter:junit-jupiter:5.9.2"
|
||||
|
0
bindings/java/gradlew
vendored
Executable file → Normal file
0
bindings/java/gradlew
vendored
Executable file → Normal file
@ -1,24 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
/**
|
||||
* Presets for alignment heads in DTW token timestamps
|
||||
*/
|
||||
public class WhisperConstants {
|
||||
// Alignment heads presets
|
||||
public static final int WHISPER_AHEADS_NONE = 0;
|
||||
public static final int WHISPER_AHEADS_TINY_EN = 1;
|
||||
public static final int WHISPER_AHEADS_TINY = 2;
|
||||
public static final int WHISPER_AHEADS_BASE_EN = 3;
|
||||
public static final int WHISPER_AHEADS_BASE = 4;
|
||||
public static final int WHISPER_AHEADS_SMALL_EN = 5;
|
||||
public static final int WHISPER_AHEADS_SMALL = 6;
|
||||
public static final int WHISPER_AHEADS_MEDIUM_EN = 7;
|
||||
public static final int WHISPER_AHEADS_MEDIUM = 8;
|
||||
public static final int WHISPER_AHEADS_LARGE_V1 = 9;
|
||||
public static final int WHISPER_AHEADS_LARGE_V2 = 10;
|
||||
public static final int WHISPER_AHEADS_LARGE_V3 = 11;
|
||||
public static final int WHISPER_AHEADS_LARGE_V3_TURBO = 12;
|
||||
public static final int WHISPER_AHEADS_CUSTOM = 13;
|
||||
public static final int WHISPER_AHEADS_N_TOP_MOST = 14;
|
||||
public static final int WHISPER_AHEADS_COUNT = 15;
|
||||
}
|
@ -1,9 +1,7 @@
|
||||
package io.github.ggerganov.whispercpp;
|
||||
|
||||
import com.sun.jna.NativeLong;
|
||||
import com.sun.jna.Structure;
|
||||
import com.sun.jna.ptr.PointerByReference;
|
||||
import com.sun.jna.Pointer;
|
||||
import io.github.ggerganov.whispercpp.ggml.GgmlType;
|
||||
import io.github.ggerganov.whispercpp.WhisperModel;
|
||||
import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
@ -11,26 +9,33 @@ import io.github.ggerganov.whispercpp.params.WhisperContextParams;
|
||||
import java.util.List;
|
||||
|
||||
public class WhisperContext extends Structure {
|
||||
public NativeLong t_load_us;
|
||||
public NativeLong t_start_us;
|
||||
int t_load_us = 0;
|
||||
int t_start_us = 0;
|
||||
|
||||
/** weight type (FP32 / FP16 / QX) */
|
||||
public GgmlType wtype = GgmlType.GGML_TYPE_F16;
|
||||
GgmlType wtype = GgmlType.GGML_TYPE_F16;
|
||||
/** intermediate type (FP32 or FP16) */
|
||||
public GgmlType itype = GgmlType.GGML_TYPE_F16;
|
||||
GgmlType itype = GgmlType.GGML_TYPE_F16;
|
||||
|
||||
public WhisperContextParams.ByValue params;
|
||||
|
||||
public Pointer model;
|
||||
public Pointer vocab;
|
||||
public Pointer state;
|
||||
// WhisperModel model;
|
||||
public PointerByReference model;
|
||||
// whisper_vocab vocab;
|
||||
// whisper_state * state = nullptr;
|
||||
public PointerByReference vocab;
|
||||
public PointerByReference state;
|
||||
|
||||
/** populated by whisper_init_from_file_with_params() */
|
||||
public Pointer path_model;
|
||||
String path_model;
|
||||
WhisperContextParams params;
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return List.of("t_load_us", "t_start_us", "wtype", "itype",
|
||||
"params", "model", "vocab", "state", "path_model");
|
||||
}
|
||||
// public static class ByReference extends WhisperContext implements Structure.ByReference {
|
||||
// }
|
||||
//
|
||||
// public static class ByValue extends WhisperContext implements Structure.ByValue {
|
||||
// }
|
||||
//
|
||||
// @Override
|
||||
// protected List<String> getFieldOrder() {
|
||||
// return List.of("t_load_us", "t_start_us", "wtype", "itype", "model", "vocab", "state", "path_model");
|
||||
// }
|
||||
}
|
||||
|
@ -43,11 +43,11 @@ public class WhisperCpp implements AutoCloseable {
|
||||
* @param modelPath - absolute path, or just the name (eg: "base", "base-en" or "base.en")
|
||||
* @param params - params to use when initialising the context
|
||||
*/
|
||||
public void initContext(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
|
||||
public void initContext(String modelPath, WhisperContextParams params) throws FileNotFoundException {
|
||||
initContextImpl(modelPath, params);
|
||||
}
|
||||
|
||||
private void initContextImpl(String modelPath, WhisperContextParams.ByValue params) throws FileNotFoundException {
|
||||
private void initContextImpl(String modelPath, WhisperContextParams params) throws FileNotFoundException {
|
||||
if (ctx != null) {
|
||||
lib.whisper_free(ctx);
|
||||
}
|
||||
@ -69,13 +69,15 @@ public class WhisperCpp implements AutoCloseable {
|
||||
|
||||
/**
|
||||
* Provides default params which can be used with `whisper_init_from_file_with_params()` etc.
|
||||
* Returns a ByValue instance to ensure proper parameter passing to native code.
|
||||
* Because this function allocates memory for the params, the caller must call either:
|
||||
* - `whisper_free_context_params()`
|
||||
* - `Native.free(Pointer.nativeValue(pointer));`
|
||||
*/
|
||||
public WhisperContextParams.ByValue getContextDefaultParams() {
|
||||
WhisperContextParams.ByValue valueParams = new WhisperContextParams.ByValue(
|
||||
lib.whisper_context_default_params_by_ref());
|
||||
valueParams.read();
|
||||
return valueParams;
|
||||
public WhisperContextParams getContextDefaultParams() {
|
||||
paramsPointer = lib.whisper_context_default_params_by_ref();
|
||||
WhisperContextParams params = new WhisperContextParams(paramsPointer);
|
||||
params.read();
|
||||
return params;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -86,7 +88,7 @@ public class WhisperCpp implements AutoCloseable {
|
||||
*
|
||||
* @param strategy - GREEDY
|
||||
*/
|
||||
public WhisperFullParams.ByValue getFullDefaultParams(WhisperSamplingStrategy strategy) {
|
||||
public WhisperFullParams getFullDefaultParams(WhisperSamplingStrategy strategy) {
|
||||
Pointer pointer;
|
||||
|
||||
// whisper_full_default_params_by_ref allocates memory which we need to delete, so only create max 1 pointer for each strategy.
|
||||
@ -102,7 +104,7 @@ public class WhisperCpp implements AutoCloseable {
|
||||
pointer = beamParamsPointer;
|
||||
}
|
||||
|
||||
WhisperFullParams.ByValue params = new WhisperFullParams.ByValue(pointer);
|
||||
WhisperFullParams params = new WhisperFullParams(pointer);
|
||||
params.read();
|
||||
return params;
|
||||
}
|
||||
@ -136,21 +138,15 @@ public class WhisperCpp implements AutoCloseable {
|
||||
}
|
||||
|
||||
/**
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text.
|
||||
* Not thread safe for same context
|
||||
* Uses the specified decoding strategy to obtain the text.
|
||||
*/
|
||||
public String fullTranscribe(WhisperFullParams.ByValue whisperParams, float[] audioData) throws IOException {
|
||||
public String fullTranscribe(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
||||
if (ctx == null) {
|
||||
throw new IllegalStateException("Model not initialised");
|
||||
}
|
||||
|
||||
/*
|
||||
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
|
||||
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
|
||||
valueParams.read();
|
||||
*/
|
||||
|
||||
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
|
||||
throw new IOException("Failed to process audio");
|
||||
}
|
||||
@ -167,17 +163,12 @@ public class WhisperCpp implements AutoCloseable {
|
||||
|
||||
return str.toString().trim();
|
||||
}
|
||||
|
||||
public List<WhisperSegment> fullTranscribeWithTime(WhisperFullParams whisperParams, float[] audioData) throws IOException {
|
||||
if (ctx == null) {
|
||||
throw new IllegalStateException("Model not initialised");
|
||||
}
|
||||
|
||||
WhisperFullParams.ByValue valueParams = new WhisperFullParams.ByValue(
|
||||
lib.whisper_full_default_params_by_ref(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH.ordinal()));
|
||||
valueParams.read();
|
||||
|
||||
if (lib.whisper_full(ctx, valueParams, audioData, audioData.length) != 0) {
|
||||
if (lib.whisper_full(ctx, whisperParams, audioData, audioData.length) != 0) {
|
||||
throw new IOException("Failed to process audio");
|
||||
}
|
||||
|
||||
|
@ -38,7 +38,7 @@ public interface WhisperCppJnaLibrary extends Library {
|
||||
* @param params Pointer to whisper_context_params
|
||||
* @return Whisper context on success, null on failure
|
||||
*/
|
||||
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams.ByValue params);
|
||||
Pointer whisper_init_from_file_with_params(String path_model, WhisperContextParams params);
|
||||
|
||||
/**
|
||||
* Allocate (almost) all memory needed for the model by loading from a buffer.
|
||||
@ -180,12 +180,12 @@ public interface WhisperCppJnaLibrary extends Library {
|
||||
/**
|
||||
* @return the id of the specified language, returns -1 if not found.
|
||||
* Examples:
|
||||
* "de" -> 2
|
||||
* "german" -> 2
|
||||
* "de" -> 2
|
||||
* "german" -> 2
|
||||
*/
|
||||
int whisper_lang_id(String lang);
|
||||
|
||||
/** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
|
||||
/** @return the short string of the specified language id (e.g. 2 -> "de"), returns nullptr if not found */
|
||||
String whisper_lang_str(int id);
|
||||
|
||||
/**
|
||||
@ -268,21 +268,20 @@ public interface WhisperCppJnaLibrary extends Library {
|
||||
void whisper_free_params(Pointer params);
|
||||
|
||||
/**
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
* Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
|
||||
* Not thread safe for same context
|
||||
* Uses the specified decoding strategy to obtain the text.
|
||||
*/
|
||||
int whisper_full(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples);
|
||||
int whisper_full(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples);
|
||||
|
||||
public int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams.ByValue params, float[] samples, int n_samples);
|
||||
//int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
|
||||
int whisper_full_with_state(Pointer ctx, Pointer state, WhisperFullParams params, final float[] samples, int n_samples);
|
||||
|
||||
// Split the input audio in chunks and process each chunk separately using whisper_full_with_state()
|
||||
// Result is stored in the default state of the context
|
||||
// Not thread safe if executed in parallel on the same context.
|
||||
// It seems this approach can offer some speedup in some cases.
|
||||
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
|
||||
int whisper_full_parallel(Pointer ctx, WhisperFullParams.ByValue params, final float[] samples, int n_samples, int n_processors);
|
||||
int whisper_full_parallel(Pointer ctx, WhisperFullParams params, final float[] samples, int n_samples, int n_processors);
|
||||
|
||||
/**
|
||||
* Number of generated text segments.
|
||||
|
@ -1,17 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.callbacks;
|
||||
|
||||
import com.sun.jna.Callback;
|
||||
|
||||
/**
|
||||
* Callback for aborting GGML computation
|
||||
* Maps to the C typedef: bool (*ggml_abort_callback)(void * data)
|
||||
*/
|
||||
public interface GgmlAbortCallback extends Callback {
|
||||
/**
|
||||
* Return true to abort the computation, false to continue
|
||||
*
|
||||
* @param data User data passed to the callback
|
||||
* @return true to abort, false to continue
|
||||
*/
|
||||
boolean invoke(com.sun.jna.Pointer data);
|
||||
}
|
@ -1,30 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
import com.sun.jna.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class WhisperAhead extends Structure {
|
||||
|
||||
public int n_text_layer;
|
||||
|
||||
public int n_head;
|
||||
|
||||
public WhisperAhead() {
|
||||
super();
|
||||
}
|
||||
|
||||
public WhisperAhead(int textLayer, int head) {
|
||||
super();
|
||||
this.n_text_layer = textLayer;
|
||||
this.n_head = head;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("n_text_layer", "n_head");
|
||||
}
|
||||
|
||||
public static class ByReference extends WhisperAhead implements Structure.ByReference {}
|
||||
|
||||
public static class ByValue extends WhisperAhead implements Structure.ByValue {}
|
||||
}
|
@ -1,41 +0,0 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
import com.sun.jna.*;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
public class WhisperAheads extends Structure {
|
||||
public NativeLong n_heads;
|
||||
|
||||
public Pointer heads;
|
||||
|
||||
public WhisperAheads() {
|
||||
super();
|
||||
}
|
||||
|
||||
/**
|
||||
* Create alignment heads from an array of WhisperAhead objects
|
||||
*/
|
||||
public void setHeads(WhisperAhead[] aheadsArray) {
|
||||
this.n_heads = new NativeLong(aheadsArray.length);
|
||||
|
||||
int structSize = aheadsArray[0].size();
|
||||
Memory mem = new Memory(structSize * aheadsArray.length);
|
||||
|
||||
for (int i = 0; i < aheadsArray.length; i++) {
|
||||
aheadsArray[i].write();
|
||||
byte[] buffer = aheadsArray[i].getPointer().getByteArray(0, structSize);
|
||||
mem.write(i * structSize, buffer, 0, buffer.length);
|
||||
}
|
||||
|
||||
this.heads = mem;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("n_heads", "heads");
|
||||
}
|
||||
|
||||
public static class ByReference extends WhisperAheads implements Structure.ByReference {}
|
||||
|
||||
public static class ByValue extends WhisperAheads implements Structure.ByValue {}
|
||||
}
|
@ -1,5 +1,7 @@
|
||||
package io.github.ggerganov.whispercpp.params;
|
||||
|
||||
import com.sun.jna.*;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
@ -9,73 +11,21 @@ import java.util.List;
|
||||
* whisper_context_default_params()
|
||||
*/
|
||||
public class WhisperContextParams extends Structure {
|
||||
|
||||
public WhisperContextParams(Pointer p) {
|
||||
super(p);
|
||||
}
|
||||
|
||||
public WhisperContextParams() {
|
||||
super();
|
||||
}
|
||||
|
||||
/** Use GPU for inference (default = true) */
|
||||
/** Use GPU for inference Number (default = true) */
|
||||
public CBool use_gpu;
|
||||
|
||||
/** Use flash attention (default = false) */
|
||||
public CBool flash_attn;
|
||||
|
||||
/** CUDA device to use (default = 0) */
|
||||
public int gpu_device;
|
||||
|
||||
/** [EXPERIMENTAL] Enable token-level timestamps with DTW (default = false) */
|
||||
public CBool dtw_token_timestamps;
|
||||
|
||||
/** [EXPERIMENTAL] Alignment heads preset for DTW */
|
||||
public int dtw_aheads_preset;
|
||||
|
||||
/** Number of top layers to use for DTW when using WHISPER_AHEADS_N_TOP_MOST preset */
|
||||
public int dtw_n_top;
|
||||
|
||||
public WhisperAheads.ByValue dtw_aheads;
|
||||
|
||||
/** DTW memory size (internal use) */
|
||||
public NativeLong dtw_mem_size;
|
||||
|
||||
/** Use GPU for inference */
|
||||
/** Use GPU for inference Number (default = true) */
|
||||
public void useGpu(boolean enable) {
|
||||
use_gpu = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Use flash attention */
|
||||
public void useFlashAttn(boolean enable) {
|
||||
flash_attn = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Enable DTW token-level timestamps */
|
||||
public void enableDtwTokenTimestamps(boolean enable) {
|
||||
dtw_token_timestamps = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Set DTW alignment heads preset */
|
||||
public void setDtwAheadsPreset(int preset) {
|
||||
dtw_aheads_preset = preset;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList(
|
||||
"use_gpu",
|
||||
"flash_attn",
|
||||
"gpu_device",
|
||||
"dtw_token_timestamps",
|
||||
"dtw_aheads_preset",
|
||||
"dtw_n_top",
|
||||
"dtw_aheads",
|
||||
"dtw_mem_size"
|
||||
);
|
||||
}
|
||||
|
||||
public static class ByValue extends WhisperContextParams implements Structure.ByValue {
|
||||
public ByValue() { super(); }
|
||||
public ByValue(Pointer p) { super(p); }
|
||||
return Arrays.asList("use_gpu");
|
||||
}
|
||||
}
|
||||
|
@ -5,7 +5,6 @@ import io.github.ggerganov.whispercpp.callbacks.WhisperEncoderBeginCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperLogitsFilterCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperNewSegmentCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.WhisperProgressCallback;
|
||||
import io.github.ggerganov.whispercpp.callbacks.GgmlAbortCallback;
|
||||
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
@ -17,12 +16,10 @@ import java.util.List;
|
||||
*/
|
||||
public class WhisperFullParams extends Structure {
|
||||
|
||||
public WhisperFullParams() {
|
||||
super();
|
||||
}
|
||||
|
||||
public WhisperFullParams(Pointer p) {
|
||||
super(p);
|
||||
// super(p, ALIGN_MSVC);
|
||||
// super(p, ALIGN_GNUC);
|
||||
}
|
||||
|
||||
/** Sampling strategy for whisper_full() function. */
|
||||
@ -72,10 +69,10 @@ public class WhisperFullParams extends Structure {
|
||||
single_segment = single ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
public CBool print_special;
|
||||
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
/** Flag to print special tokens (e.g., <SOT>, <EOT>, <BEG>, etc.). (default = false) */
|
||||
public void printSpecial(boolean enable) {
|
||||
print_special = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
@ -132,14 +129,6 @@ public class WhisperFullParams extends Structure {
|
||||
/** Maximum tokens per segment (0, default = no limit) */
|
||||
public int max_tokens;
|
||||
|
||||
/** [EXPERIMENTAL] Enable debug mode for extra info */
|
||||
public CBool debug_mode;
|
||||
|
||||
/** Enable debug mode */
|
||||
public void enableDebugMode(boolean enable) {
|
||||
debug_mode = enable ? CBool.TRUE : CBool.FALSE;
|
||||
}
|
||||
|
||||
/** Overwrite the audio context size (0 = use default). */
|
||||
public int audio_ctx;
|
||||
|
||||
@ -285,16 +274,6 @@ public class WhisperFullParams extends Structure {
|
||||
*/
|
||||
public Pointer encoder_begin_callback_user_data;
|
||||
|
||||
/** Callback used to abort GGML computation */
|
||||
public Pointer abort_callback;
|
||||
|
||||
/** User data for the abort_callback */
|
||||
public Pointer abort_callback_user_data;
|
||||
|
||||
public void setAbortCallback(GgmlAbortCallback callback) {
|
||||
abort_callback = CallbackReference.getFunctionPointer(callback);
|
||||
}
|
||||
|
||||
/**
|
||||
* Callback by each decoder to filter obtained logits.
|
||||
* WhisperLogitsFilterCallback
|
||||
@ -331,28 +310,17 @@ public class WhisperFullParams extends Structure {
|
||||
|
||||
@Override
|
||||
protected List<String> getFieldOrder() {
|
||||
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx",
|
||||
"offset_ms", "duration_ms", "translate", "no_context",
|
||||
"no_timestamps", "single_segment", "print_special",
|
||||
"print_progress", "print_realtime", "print_timestamps",
|
||||
"token_timestamps", "thold_pt", "thold_ptsum", "max_len",
|
||||
"split_on_word", "max_tokens", "debug_mode", "audio_ctx",
|
||||
"tdrz_enable", "suppress_regex", "initial_prompt",
|
||||
"prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
||||
"suppress_blank", "suppress_nst", "temperature",
|
||||
"max_initial_ts", "length_penalty", "temperature_inc",
|
||||
"entropy_thold", "logprob_thold", "no_speech_thold", "greedy",
|
||||
"beam_search", "new_segment_callback", "new_segment_callback_user_data",
|
||||
return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",
|
||||
"no_context", "single_segment", "no_timestamps",
|
||||
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",
|
||||
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",
|
||||
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",
|
||||
"suppress_blank", "suppress_nst", "temperature", "max_initial_ts", "length_penalty",
|
||||
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",
|
||||
"new_segment_callback", "new_segment_callback_user_data",
|
||||
"progress_callback", "progress_callback_user_data",
|
||||
"encoder_begin_callback", "encoder_begin_callback_user_data",
|
||||
"abort_callback", "abort_callback_user_data",
|
||||
"logits_filter_callback", "logits_filter_callback_user_data",
|
||||
"grammar_rules", "n_grammar_rules", "i_start_rule", "grammar_penalty");
|
||||
}
|
||||
|
||||
public static class ByValue extends WhisperFullParams implements Structure.ByValue {
|
||||
public ByValue() { super(); }
|
||||
public ByValue(Pointer p) { super(p); }
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -76,7 +76,7 @@ class WhisperCppTest {
|
||||
float[] floats = new float[b.length / 2];
|
||||
|
||||
//WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_GREEDY);
|
||||
WhisperFullParams.ByValue params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
WhisperFullParams params = whisper.getFullDefaultParams(WhisperSamplingStrategy.WHISPER_SAMPLING_BEAM_SEARCH);
|
||||
params.setProgressCallback((ctx, state, progress, user_data) -> System.out.println("progress: " + progress));
|
||||
params.print_progress = CBool.FALSE;
|
||||
//params.initial_prompt = "and so my fellow Americans um, like";
|
||||
|
@ -33,9 +33,6 @@ mkdir build-em && cd build-em
|
||||
emcmake cmake .. && make -j
|
||||
|
||||
# run test
|
||||
node ../tests/test-whisper.js
|
||||
|
||||
# For Node.js versions prior to v16.4.0, experimental features need to be enabled:
|
||||
node --experimental-wasm-threads --experimental-wasm-simd ../tests/test-whisper.js
|
||||
|
||||
# publish npm package
|
||||
|
@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "whisper.cpp",
|
||||
"version": "1.7.5",
|
||||
"version": "1.7.4",
|
||||
"description": "Whisper speech recognition",
|
||||
"main": "whisper.js",
|
||||
"scripts": {
|
||||
|
@ -228,7 +228,7 @@ The second argument `samples` may be an array, an object with `length` and `each
|
||||
Development
|
||||
-----------
|
||||
|
||||
% git clone https://github.com/ggml-org/whisper.cpp.git
|
||||
% git clone https://github.com/ggerganov/whisper.cpp.git
|
||||
% cd whisper.cpp/bindings/ruby
|
||||
% rake test
|
||||
|
||||
@ -241,5 +241,5 @@ License
|
||||
|
||||
The same to [whisper.cpp][].
|
||||
|
||||
[whisper.cpp]: https://github.com/ggml-org/whisper.cpp
|
||||
[models]: https://github.com/ggml-org/whisper.cpp/tree/master/models
|
||||
[whisper.cpp]: https://github.com/ggerganov/whisper.cpp
|
||||
[models]: https://github.com/ggerganov/whisper.cpp/tree/master/models
|
||||
|
@ -1,9 +1,5 @@
|
||||
ggml/src/ggml-cpu/ggml-cpu-cpp.o: \
|
||||
ggml/src/ggml-cpu/ggml-cpu.cpp \
|
||||
ggml/src/ggml-cpu/unary-ops.cpp \
|
||||
ggml/src/ggml-cpu/binary-ops.cpp \
|
||||
ggml/src/ggml-cpu/vec.cpp \
|
||||
ggml/src/ggml-cpu/ops.cpp \
|
||||
ggml/include/ggml-backend.h \
|
||||
ggml/include/ggml.h \
|
||||
ggml/include/ggml-alloc.h \
|
||||
|
@ -168,11 +168,7 @@ $OBJ_GGML <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-aarch64.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-hbm.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-quants.o' <<
|
||||
'ggml/src/ggml-cpu/ggml-cpu-traits.o' <<
|
||||
'ggml/src/ggml-cpu/unary-ops.o' <<
|
||||
'ggml/src/ggml-cpu/binary-ops.o' <<
|
||||
'ggml/src/ggml-cpu/vec.o' <<
|
||||
'ggml/src/ggml-cpu/ops.o'
|
||||
'ggml/src/ggml-cpu/ggml-cpu-traits.o'
|
||||
|
||||
$OBJ_WHISPER <<
|
||||
'src/whisper.o' <<
|
||||
|
@ -918,7 +918,7 @@ ruby_whisper_params_initialize(int argc, VALUE *argv, VALUE self)
|
||||
return self;
|
||||
}
|
||||
|
||||
rb_get_kwargs(kw_hash, param_names, 0, RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT, values);
|
||||
rb_get_kwargs(kw_hash, &param_names, 0, RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT, &values);
|
||||
Data_Get_Struct(self, ruby_whisper_params, rwp);
|
||||
|
||||
for (i = 0; i < RUBY_WHISPER_PARAMS_PARAM_NAMES_COUNT; i++) {
|
||||
|
@ -34,7 +34,7 @@ module Whisper
|
||||
when /darwin/
|
||||
Pathname(Dir.home)/"Library/Caches"
|
||||
else
|
||||
ENV.key?("XDG_CACHE_HOME") ? Pathname(ENV["XDG_CACHE_HOME"]) : Pathname(Dir.home)/".cache"
|
||||
ENV.key?("XDG_CACHE_HOME") ? ENV["XDG_CACHE_HOME"] : Pathname(Dir.home)/".cache"
|
||||
end
|
||||
base/"whisper.cpp"
|
||||
end
|
||||
|
@ -25,7 +25,7 @@ class TestCallback < TestBase
|
||||
assert start_time >= 0
|
||||
assert_kind_of Integer, end_time
|
||||
assert end_time > 0
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, text) if i_segment == 0
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, text if i_segment == 0
|
||||
end
|
||||
}
|
||||
|
||||
@ -145,9 +145,9 @@ class TestCallback < TestBase
|
||||
|
||||
def test_abort_on
|
||||
do_abort = false
|
||||
_aborted_from_callback = false
|
||||
aborted_from_callback = false
|
||||
@params.on_new_segment do |segment|
|
||||
do_abort = true if segment.text.match?(/ask/)
|
||||
do_abort = true if segment.text.match? /ask/
|
||||
end
|
||||
i = 0
|
||||
@params.abort_on do
|
||||
|
@ -4,7 +4,7 @@ class TestError < TestBase
|
||||
def test_error
|
||||
error = Whisper::Error.new(-2)
|
||||
assert_equal "failed to compute log mel spectrogram", error.message
|
||||
assert_equal(-2, error.code)
|
||||
assert_equal -2, error.code
|
||||
end
|
||||
|
||||
def test_unknown_error
|
||||
@ -14,7 +14,7 @@ class TestError < TestBase
|
||||
|
||||
def test_non_int_code
|
||||
assert_raise TypeError do
|
||||
_error = Whisper::Error.new("non int")
|
||||
error = Whisper::Error.new("non int")
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -162,7 +162,7 @@ class TestParams < TestBase
|
||||
end
|
||||
|
||||
def test_length_penalty
|
||||
assert_equal(-1.0, @params.length_penalty)
|
||||
assert_equal -1.0, @params.length_penalty
|
||||
@params.length_penalty = 0.5
|
||||
assert_equal 0.5, @params.length_penalty
|
||||
end
|
||||
@ -180,9 +180,9 @@ class TestParams < TestBase
|
||||
end
|
||||
|
||||
def test_logprob_thold
|
||||
assert_in_delta(-1.0, @params.logprob_thold)
|
||||
assert_in_delta -1.0, @params.logprob_thold
|
||||
@params.logprob_thold = -0.5
|
||||
assert_in_delta(-0.5, @params.logprob_thold)
|
||||
assert_in_delta -0.5, @params.logprob_thold
|
||||
end
|
||||
|
||||
def test_no_speech_thold
|
||||
|
@ -49,13 +49,13 @@ class TestSegment < TestBase
|
||||
if index == 0
|
||||
seg = segment
|
||||
assert_equal 0, segment.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
|
||||
end
|
||||
index += 1
|
||||
end
|
||||
whisper.transcribe(AUDIO, params)
|
||||
assert_equal 0, seg.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, seg.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, seg.text
|
||||
end
|
||||
|
||||
def test_on_new_segment_twice
|
||||
|
@ -16,7 +16,7 @@ class TestWhisper < TestBase
|
||||
params.print_timestamps = false
|
||||
|
||||
@whisper.transcribe(AUDIO, params) {|text|
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, text
|
||||
}
|
||||
end
|
||||
|
||||
@ -32,7 +32,7 @@ class TestWhisper < TestBase
|
||||
def test_full_get_segment
|
||||
segment = whisper.full_get_segment(0)
|
||||
assert_equal 0, segment.start_time
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, segment.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, segment.text
|
||||
end
|
||||
|
||||
def test_full_get_segment_t0
|
||||
@ -59,7 +59,7 @@ class TestWhisper < TestBase
|
||||
end
|
||||
|
||||
def test_full_get_segment_text
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0))
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, whisper.full_get_segment_text(0)
|
||||
end
|
||||
|
||||
def test_full_get_segment_no_speech_prob
|
||||
@ -134,14 +134,14 @@ class TestWhisper < TestBase
|
||||
@whisper.full(@params, @samples, @samples.length)
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
|
||||
end
|
||||
|
||||
def test_full_without_length
|
||||
@whisper.full(@params, @samples)
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
|
||||
end
|
||||
|
||||
def test_full_enumerator
|
||||
@ -149,7 +149,7 @@ class TestWhisper < TestBase
|
||||
@whisper.full(@params, samples, @samples.length)
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
|
||||
end
|
||||
|
||||
def test_full_enumerator_without_length
|
||||
@ -171,28 +171,26 @@ class TestWhisper < TestBase
|
||||
@whisper.full(@params, samples)
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
assert_match(/ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text)
|
||||
assert_match /ask not what your country can do for you, ask what you can do for your country/, @whisper.each_segment.first.text
|
||||
end
|
||||
|
||||
def test_full_parallel
|
||||
nprocessors = 2
|
||||
@whisper.full_parallel(@params, @samples, @samples.length, nprocessors)
|
||||
@whisper.full_parallel(@params, @samples, @samples.length, Etc.nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
assert_equal Etc.nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
|
||||
def test_full_parallel_with_memory_view
|
||||
nprocessors = 2
|
||||
samples = JFKReader.new(AUDIO)
|
||||
@whisper.full_parallel(@params, samples, nil, nprocessors)
|
||||
@whisper.full_parallel(@params, samples, nil, Etc.nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
assert_equal Etc.nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
|
||||
def test_full_parallel_without_length_and_n_processors
|
||||
@ -200,18 +198,17 @@ class TestWhisper < TestBase
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
|
||||
def test_full_parallel_without_length
|
||||
nprocessors = 2
|
||||
@whisper.full_parallel(@params, @samples, nil, nprocessors)
|
||||
@whisper.full_parallel(@params, @samples, nil, Etc.nprocessors)
|
||||
|
||||
assert_equal nprocessors, @whisper.full_n_segments
|
||||
assert_equal Etc.nprocessors, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
|
||||
def test_full_parallel_without_n_processors
|
||||
@ -219,8 +216,8 @@ class TestWhisper < TestBase
|
||||
|
||||
assert_equal 1, @whisper.full_n_segments
|
||||
text = @whisper.each_segment.collect(&:text).join
|
||||
assert_match(/ask what you can do/i, text)
|
||||
assert_match(/for your country/i, text)
|
||||
assert_match /ask what you can do/i, text
|
||||
assert_match /for your country/i, text
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -26,7 +26,7 @@ Gem::Specification.new do |s|
|
||||
s.required_ruby_version = '>= 3.1.0'
|
||||
|
||||
#### Documentation and testing.
|
||||
s.homepage = 'https://github.com/ggml-org/whisper.cpp'
|
||||
s.homepage = 'https://github.com/ggerganov/whisper.cpp'
|
||||
s.rdoc_options = ['--main', 'README.md']
|
||||
|
||||
|
||||
|
@ -41,11 +41,6 @@ COMMON_CMAKE_ARGS=(
|
||||
-DGGML_OPENMP=${GGML_OPENMP}
|
||||
)
|
||||
|
||||
XCODE_VERSION=$(xcodebuild -version 2>/dev/null | head -n1 | awk '{ print $2 }')
|
||||
MAJOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f1)
|
||||
MINOR_VERSION=$(echo $XCODE_VERSION | cut -d. -f2)
|
||||
echo "Detected Xcode version: $XCODE_VERSION"
|
||||
|
||||
check_required_tool() {
|
||||
local tool=$1
|
||||
local install_message=$2
|
||||
@ -113,7 +108,7 @@ setup_framework_structure() {
|
||||
fi
|
||||
|
||||
# Copy all required headers (common for all platforms)
|
||||
cp include/whisper.h ${header_path}
|
||||
cp include/whisper.h ${header_path}
|
||||
cp ggml/include/ggml.h ${header_path}
|
||||
cp ggml/include/ggml-alloc.h ${header_path}
|
||||
cp ggml/include/ggml-backend.h ${header_path}
|
||||
@ -250,16 +245,9 @@ combine_static_libraries() {
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-metal/${release_dir}/libggml-metal.a"
|
||||
"${base_dir}/${build_dir}/ggml/src/ggml-blas/${release_dir}/libggml-blas.a"
|
||||
)
|
||||
if [[ "$platform" == "macos" || "$platform" == "ios" ]]; then
|
||||
echo "Adding libwhisper.coreml library to the build."
|
||||
libs+=(
|
||||
"${base_dir}/${build_dir}/src/${release_dir}/libwhisper.coreml.a"
|
||||
)
|
||||
fi
|
||||
|
||||
# Create temporary directory for processing
|
||||
local temp_dir="${base_dir}/${build_dir}/temp"
|
||||
echo "Creating temporary directory: ${temp_dir}"
|
||||
mkdir -p "${temp_dir}"
|
||||
|
||||
# Since we have multiple architectures libtool will find object files that do not
|
||||
@ -271,7 +259,6 @@ combine_static_libraries() {
|
||||
local archs=""
|
||||
local min_version_flag=""
|
||||
local install_name=""
|
||||
local frameworks="-framework Foundation -framework Metal -framework Accelerate"
|
||||
|
||||
case "$platform" in
|
||||
"ios")
|
||||
@ -285,14 +272,12 @@ combine_static_libraries() {
|
||||
min_version_flag="-mios-version-min=${IOS_MIN_OS_VERSION}"
|
||||
fi
|
||||
install_name="@rpath/whisper.framework/whisper"
|
||||
frameworks+=" -framework CoreML"
|
||||
;;
|
||||
"macos")
|
||||
sdk="macosx"
|
||||
archs="arm64 x86_64"
|
||||
min_version_flag="-mmacosx-version-min=${MACOS_MIN_OS_VERSION}"
|
||||
install_name="@rpath/whisper.framework/Versions/Current/whisper"
|
||||
frameworks+=" -framework CoreML"
|
||||
;;
|
||||
"visionos")
|
||||
if [[ "$is_simulator" == "true" ]]; then
|
||||
@ -334,34 +319,27 @@ combine_static_libraries() {
|
||||
$arch_flags \
|
||||
$min_version_flag \
|
||||
-Wl,-force_load,"${temp_dir}/combined.a" \
|
||||
$frameworks \
|
||||
-framework Foundation -framework Metal -framework Accelerate \
|
||||
-install_name "$install_name" \
|
||||
-o "${base_dir}/${output_lib}"
|
||||
|
||||
# Platform-specific post-processing for device builds
|
||||
if [[ "$is_simulator" == "false" ]]; then
|
||||
if command -v xcrun vtool &>/dev/null; then
|
||||
if command -v vtool &>/dev/null; then
|
||||
case "$platform" in
|
||||
"ios")
|
||||
echo "Marking binary as a framework binary for iOS..."
|
||||
xcrun vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
vtool -set-build-version ios ${IOS_MIN_OS_VERSION} ${IOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"visionos")
|
||||
echo "Marking binary as a framework binary for visionOS..."
|
||||
if [[ "$MAJOR_VERSION" -gt 16 ]] || [[ "$MAJOR_VERSION" -eq 16 && "$MINOR_VERSION" -gt 2 ]]; then
|
||||
echo "Xcode version greater than 16.2, using visionOS."
|
||||
VISION_OS_BUILD_VERSION="visionos"
|
||||
else
|
||||
echo "Xcode version less than or equal to 16.2, using xros."
|
||||
VISION_OS_BUILD_VERSION="xros"
|
||||
fi
|
||||
xcrun vtool -set-build-version ${VISION_OS_BUILD_VERSION} ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
vtool -set-build-version xros ${VISIONOS_MIN_OS_VERSION} ${VISIONOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
"tvos")
|
||||
echo "Marking binary as a framework binary for tvOS..."
|
||||
xcrun vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
vtool -set-build-version tvos ${TVOS_MIN_OS_VERSION} ${TVOS_MIN_OS_VERSION} -replace \
|
||||
-output "${base_dir}/${output_lib}" "${base_dir}/${output_lib}"
|
||||
;;
|
||||
esac
|
||||
@ -421,8 +399,6 @@ cmake -B build-ios-sim -G Xcode \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphonesimulator \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DWHISPER_COREML="ON" \
|
||||
-DWHISPER_COREML_ALLOW_FALLBACK="ON" \
|
||||
-S .
|
||||
cmake --build build-ios-sim --config Release -- -quiet
|
||||
|
||||
@ -435,8 +411,6 @@ cmake -B build-ios-device -G Xcode \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=iphoneos \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DWHISPER_COREML="ON" \
|
||||
-DWHISPER_COREML_ALLOW_FALLBACK="ON" \
|
||||
-S .
|
||||
cmake --build build-ios-device --config Release -- -quiet
|
||||
|
||||
@ -447,8 +421,6 @@ cmake -B build-macos -G Xcode \
|
||||
-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64" \
|
||||
-DCMAKE_C_FLAGS="${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="${COMMON_CXX_FLAGS}" \
|
||||
-DWHISPER_COREML="ON" \
|
||||
-DWHISPER_COREML_ALLOW_FALLBACK="ON" \
|
||||
-S .
|
||||
cmake --build build-macos --config Release -- -quiet
|
||||
|
||||
@ -460,8 +432,8 @@ cmake -B build-visionos -G Xcode \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xros \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xros \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-visionos --config Release -- -quiet
|
||||
|
||||
@ -473,8 +445,8 @@ cmake -B build-visionos-sim -G Xcode \
|
||||
-DCMAKE_SYSTEM_NAME=visionOS \
|
||||
-DCMAKE_OSX_SYSROOT=xrsimulator \
|
||||
-DCMAKE_XCODE_ATTRIBUTE_SUPPORTED_PLATFORMS=xrsimulator \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 ${COMMON_CXX_FLAGS}" \
|
||||
-DCMAKE_C_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_C_FLAGS}" \
|
||||
-DCMAKE_CXX_FLAGS="-D_XOPEN_SOURCE=700 -Du_int=unsigned\ int -Du_char=unsigned\ char -Du_short=unsigned\ short ${COMMON_CXX_FLAGS}" \
|
||||
-S .
|
||||
cmake --build build-visionos-sim --config Release -- -quiet
|
||||
|
||||
|
@ -10,8 +10,6 @@
|
||||
# # with CUDA support
|
||||
# GG_BUILD_CUDA=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
#
|
||||
# # with SYCL support
|
||||
# GG_BUILD_SYCL=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
|
||||
|
||||
if [ -z "$2" ]; then
|
||||
echo "usage: $0 <output-dir> <mnt-dir>"
|
||||
@ -326,9 +324,8 @@ ret=0
|
||||
for model in "${MODELS[@]}"; do
|
||||
test $ret -eq 0 && gg_download_model ${model}
|
||||
done
|
||||
if [ -z ${GG_BUILD_SYCL}]; then
|
||||
test $ret -eq 0 && gg_run ctest debug
|
||||
fi
|
||||
|
||||
test $ret -eq 0 && gg_run ctest debug
|
||||
test $ret -eq 0 && gg_run ctest release
|
||||
|
||||
test $ret -eq 0 && gg_run bench
|
||||
|
@ -18,7 +18,6 @@ const whisperParamsMock = {
|
||||
translate: true,
|
||||
no_timestamps: false,
|
||||
audio_ctx: 0,
|
||||
max_len: 0,
|
||||
};
|
||||
|
||||
describe("Run whisper.node", () => {
|
||||
|
@ -128,227 +128,192 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper
|
||||
|
||||
void cb_log_disable(enum ggml_log_level, const char *, void *) {}
|
||||
|
||||
class ProgressWorker : public Napi::AsyncWorker {
|
||||
public:
|
||||
ProgressWorker(Napi::Function& callback, whisper_params params, Napi::Function progress_callback, Napi::Env env)
|
||||
: Napi::AsyncWorker(callback), params(params), env(env) {
|
||||
// Create thread-safe function
|
||||
if (!progress_callback.IsEmpty()) {
|
||||
tsfn = Napi::ThreadSafeFunction::New(
|
||||
env,
|
||||
progress_callback,
|
||||
"Progress Callback",
|
||||
0,
|
||||
1
|
||||
);
|
||||
}
|
||||
int run(whisper_params ¶ms, std::vector<std::vector<std::string>> &result) {
|
||||
if (params.no_prints) {
|
||||
whisper_log_set(cb_log_disable, NULL);
|
||||
}
|
||||
|
||||
~ProgressWorker() {
|
||||
if (tsfn) {
|
||||
// Make sure to release the thread-safe function on destruction
|
||||
tsfn.Release();
|
||||
}
|
||||
if (params.fname_inp.empty() && params.pcmf32.empty()) {
|
||||
fprintf(stderr, "error: no input files or audio buffer specified\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
void Execute() override {
|
||||
// Use custom run function with progress callback support
|
||||
run_with_progress(params, result);
|
||||
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
exit(0);
|
||||
}
|
||||
|
||||
void OnOK() override {
|
||||
Napi::HandleScope scope(Env());
|
||||
Napi::Object res = Napi::Array::New(Env(), result.size());
|
||||
for (uint64_t i = 0; i < result.size(); ++i) {
|
||||
Napi::Object tmp = Napi::Array::New(Env(), 3);
|
||||
for (uint64_t j = 0; j < 3; ++j) {
|
||||
tmp[j] = Napi::String::New(Env(), result[i][j]);
|
||||
// whisper init
|
||||
|
||||
struct whisper_context_params cparams = whisper_context_default_params();
|
||||
cparams.use_gpu = params.use_gpu;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
||||
|
||||
if (ctx == nullptr) {
|
||||
fprintf(stderr, "error: failed to initialize whisper context\n");
|
||||
return 3;
|
||||
}
|
||||
|
||||
// if params.pcmf32 is provided, set params.fname_inp to "buffer"
|
||||
// this is simpler than further modifications in the code
|
||||
if (!params.pcmf32.empty()) {
|
||||
fprintf(stderr, "info: using audio buffer as input\n");
|
||||
params.fname_inp.clear();
|
||||
params.fname_inp.emplace_back("buffer");
|
||||
}
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
|
||||
// read the input audio file if params.pcmf32 is not provided
|
||||
if (params.pcmf32.empty()) {
|
||||
if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
|
||||
fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
pcmf32 = params.pcmf32;
|
||||
}
|
||||
|
||||
// print system information
|
||||
if (!params.no_prints) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
|
||||
}
|
||||
|
||||
// print some info about the processing
|
||||
if (!params.no_prints) {
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
|
||||
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
||||
params.n_threads, params.n_processors,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1,
|
||||
params.audio_ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// run the inference
|
||||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
||||
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = params.print_progress;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.translate = params.translate;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
||||
wparams.offset_ms = params.offset_t_ms;
|
||||
wparams.duration_ms = params.duration_ms;
|
||||
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.entropy_thold = params.entropy_thold;
|
||||
wparams.logprob_thold = params.logprob_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
|
||||
wparams.greedy.best_of = params.best_of;
|
||||
wparams.beam_search.beam_size = params.beam_size;
|
||||
|
||||
wparams.initial_prompt = params.prompt.c_str();
|
||||
|
||||
wparams.no_timestamps = params.no_timestamps;
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s };
|
||||
|
||||
// this callback is called on each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||
wparams.new_segment_callback_user_data = &user_data;
|
||||
}
|
||||
|
||||
// example for abort mechanism
|
||||
// in this example, we do not abort the processing, but we could if the flag is set to true
|
||||
// the callback is called before every encoder run - if it returns false, the processing is aborted
|
||||
{
|
||||
static bool is_aborted = false; // NOTE: this should be atomic to avoid data race
|
||||
|
||||
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
wparams.encoder_begin_callback_user_data = &is_aborted;
|
||||
}
|
||||
|
||||
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
return 10;
|
||||
}
|
||||
res[i] = tmp;
|
||||
}
|
||||
Callback().Call({Env().Null(), res});
|
||||
}
|
||||
|
||||
// Progress callback function - using thread-safe function
|
||||
void OnProgress(int progress) {
|
||||
if (tsfn) {
|
||||
// Use thread-safe function to call JavaScript callback
|
||||
auto callback = [progress](Napi::Env env, Napi::Function jsCallback) {
|
||||
jsCallback.Call({Napi::Number::New(env, progress)});
|
||||
};
|
||||
|
||||
tsfn.BlockingCall(callback);
|
||||
}
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
result.resize(n_segments);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
|
||||
result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
|
||||
result[i].emplace_back(text);
|
||||
}
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
class Worker : public Napi::AsyncWorker {
|
||||
public:
|
||||
Worker(Napi::Function& callback, whisper_params params)
|
||||
: Napi::AsyncWorker(callback), params(params) {}
|
||||
|
||||
void Execute() override {
|
||||
run(params, result);
|
||||
}
|
||||
|
||||
void OnOK() override {
|
||||
Napi::HandleScope scope(Env());
|
||||
Napi::Object res = Napi::Array::New(Env(), result.size());
|
||||
for (uint64_t i = 0; i < result.size(); ++i) {
|
||||
Napi::Object tmp = Napi::Array::New(Env(), 3);
|
||||
for (uint64_t j = 0; j < 3; ++j) {
|
||||
tmp[j] = Napi::String::New(Env(), result[i][j]);
|
||||
}
|
||||
res[i] = tmp;
|
||||
}
|
||||
Callback().Call({Env().Null(), res});
|
||||
}
|
||||
|
||||
private:
|
||||
whisper_params params;
|
||||
std::vector<std::vector<std::string>> result;
|
||||
Napi::Env env;
|
||||
Napi::ThreadSafeFunction tsfn;
|
||||
|
||||
// Custom run function with progress callback support
|
||||
int run_with_progress(whisper_params ¶ms, std::vector<std::vector<std::string>> &result) {
|
||||
if (params.no_prints) {
|
||||
whisper_log_set(cb_log_disable, NULL);
|
||||
}
|
||||
|
||||
if (params.fname_inp.empty() && params.pcmf32.empty()) {
|
||||
fprintf(stderr, "error: no input files or audio buffer specified\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (params.language != "auto" && whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// whisper init
|
||||
struct whisper_context_params cparams = whisper_context_default_params();
|
||||
cparams.use_gpu = params.use_gpu;
|
||||
cparams.flash_attn = params.flash_attn;
|
||||
struct whisper_context * ctx = whisper_init_from_file_with_params(params.model.c_str(), cparams);
|
||||
|
||||
if (ctx == nullptr) {
|
||||
fprintf(stderr, "error: failed to initialize whisper context\n");
|
||||
return 3;
|
||||
}
|
||||
|
||||
// If params.pcmf32 provides, set params.fname_inp as "buffer"
|
||||
if (!params.pcmf32.empty()) {
|
||||
fprintf(stderr, "info: using audio buffer as input\n");
|
||||
params.fname_inp.clear();
|
||||
params.fname_inp.emplace_back("buffer");
|
||||
}
|
||||
|
||||
for (int f = 0; f < (int) params.fname_inp.size(); ++f) {
|
||||
const auto fname_inp = params.fname_inp[f];
|
||||
const auto fname_out = f < (int)params.fname_out.size() && !params.fname_out[f].empty() ? params.fname_out[f] : params.fname_inp[f];
|
||||
|
||||
std::vector<float> pcmf32; // mono-channel F32 PCM
|
||||
std::vector<std::vector<float>> pcmf32s; // stereo-channel F32 PCM
|
||||
|
||||
// If params.pcmf32 is empty, read input audio file
|
||||
if (params.pcmf32.empty()) {
|
||||
if (!::read_audio_data(fname_inp, pcmf32, pcmf32s, params.diarize)) {
|
||||
fprintf(stderr, "error: failed to read audio file '%s'\n", fname_inp.c_str());
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
pcmf32 = params.pcmf32;
|
||||
}
|
||||
|
||||
// Print system info
|
||||
if (!params.no_prints) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "system_info: n_threads = %d / %d | %s\n",
|
||||
params.n_threads*params.n_processors, std::thread::hardware_concurrency(), whisper_print_system_info());
|
||||
}
|
||||
|
||||
// Print processing info
|
||||
if (!params.no_prints) {
|
||||
fprintf(stderr, "\n");
|
||||
if (!whisper_is_multilingual(ctx)) {
|
||||
if (params.language != "en" || params.translate) {
|
||||
params.language = "en";
|
||||
params.translate = false;
|
||||
fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__);
|
||||
}
|
||||
}
|
||||
fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d, audio_ctx = %d ...\n",
|
||||
__func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE,
|
||||
params.n_threads, params.n_processors,
|
||||
params.language.c_str(),
|
||||
params.translate ? "translate" : "transcribe",
|
||||
params.no_timestamps ? 0 : 1,
|
||||
params.audio_ctx);
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
// Run inference
|
||||
{
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.strategy = params.beam_size > 1 ? WHISPER_SAMPLING_BEAM_SEARCH : WHISPER_SAMPLING_GREEDY;
|
||||
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_progress = params.print_progress;
|
||||
wparams.print_timestamps = !params.no_timestamps;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.translate = params.translate;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
|
||||
wparams.offset_ms = params.offset_t_ms;
|
||||
wparams.duration_ms = params.duration_ms;
|
||||
|
||||
wparams.token_timestamps = params.output_wts || params.max_len > 0;
|
||||
wparams.thold_pt = params.word_thold;
|
||||
wparams.entropy_thold = params.entropy_thold;
|
||||
wparams.logprob_thold = params.logprob_thold;
|
||||
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
|
||||
wparams.greedy.best_of = params.best_of;
|
||||
wparams.beam_search.beam_size = params.beam_size;
|
||||
|
||||
wparams.initial_prompt = params.prompt.c_str();
|
||||
|
||||
wparams.no_timestamps = params.no_timestamps;
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s };
|
||||
|
||||
// This callback is called for each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
wparams.new_segment_callback = whisper_print_segment_callback;
|
||||
wparams.new_segment_callback_user_data = &user_data;
|
||||
}
|
||||
|
||||
// Set progress callback
|
||||
wparams.progress_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
|
||||
ProgressWorker* worker = static_cast<ProgressWorker*>(user_data);
|
||||
worker->OnProgress(progress);
|
||||
};
|
||||
wparams.progress_callback_user_data = this;
|
||||
|
||||
// Abort mechanism example
|
||||
{
|
||||
static bool is_aborted = false; // Note: this should be atomic to avoid data races
|
||||
|
||||
wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
|
||||
bool is_aborted = *(bool*)user_data;
|
||||
return !is_aborted;
|
||||
};
|
||||
wparams.encoder_begin_callback_user_data = &is_aborted;
|
||||
}
|
||||
|
||||
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
|
||||
fprintf(stderr, "failed to process audio\n");
|
||||
return 10;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
result.resize(n_segments);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const char * text = whisper_full_get_segment_text(ctx, i);
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
result[i].emplace_back(to_timestamp(t0, params.comma_in_time));
|
||||
result[i].emplace_back(to_timestamp(t1, params.comma_in_time));
|
||||
result[i].emplace_back(text);
|
||||
}
|
||||
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
||||
whisper_params params;
|
||||
std::vector<std::vector<std::string>> result;
|
||||
};
|
||||
|
||||
|
||||
|
||||
Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
Napi::Env env = info.Env();
|
||||
if (info.Length() <= 0 || !info[0].IsObject()) {
|
||||
@ -367,23 +332,6 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
|
||||
bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
|
||||
int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
|
||||
|
||||
// support prompt
|
||||
std::string prompt = "";
|
||||
if (whisper_params.Has("prompt") && whisper_params.Get("prompt").IsString()) {
|
||||
prompt = whisper_params.Get("prompt").As<Napi::String>();
|
||||
}
|
||||
|
||||
// Add support for print_progress
|
||||
bool print_progress = false;
|
||||
if (whisper_params.Has("print_progress")) {
|
||||
print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
|
||||
}
|
||||
// Add support for progress_callback
|
||||
Napi::Function progress_callback;
|
||||
if (whisper_params.Has("progress_callback") && whisper_params.Get("progress_callback").IsFunction()) {
|
||||
progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
|
||||
}
|
||||
|
||||
Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
|
||||
std::vector<float> pcmf32_vec;
|
||||
@ -407,12 +355,9 @@ Napi::Value whisper(const Napi::CallbackInfo& info) {
|
||||
params.pcmf32 = pcmf32_vec;
|
||||
params.comma_in_time = comma_in_time;
|
||||
params.max_len = max_len;
|
||||
params.print_progress = print_progress;
|
||||
params.prompt = prompt;
|
||||
|
||||
Napi::Function callback = info[1].As<Napi::Function>();
|
||||
// Create a new Worker class with progress callback support
|
||||
ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
|
||||
Worker* worker = new Worker(callback, params);
|
||||
worker->Queue();
|
||||
return env.Undefined();
|
||||
}
|
||||
|
@ -19,9 +19,6 @@ const whisperParams = {
|
||||
no_timestamps: false,
|
||||
audio_ctx: 0,
|
||||
max_len: 0,
|
||||
progress_callback: (progress) => {
|
||||
console.log(`progress: ${progress}%`);
|
||||
}
|
||||
};
|
||||
|
||||
const arguments = process.argv.slice(2);
|
||||
|
@ -2,7 +2,7 @@
|
||||
|
||||
Benchmark the performance of whisper.cpp in the browser using WebAssembly
|
||||
|
||||
Link: https://ggerganov.github.io/whisper.cpp/bench.wasm
|
||||
Link: https://whisper.ggerganov.com/bench/
|
||||
|
||||
Terminal version: [examples/bench](/examples/bench)
|
||||
|
||||
@ -15,17 +15,7 @@ cd whisper.cpp
|
||||
mkdir build-em && cd build-em
|
||||
emcmake cmake ..
|
||||
make -j
|
||||
```
|
||||
The example can then be started by running a local HTTP server:
|
||||
```console
|
||||
python3 examples/server.py
|
||||
```
|
||||
And then opening a browser to the following URL:
|
||||
http://localhost:8000/bench.wasm
|
||||
|
||||
To run the example in a different server, you need to copy the following files
|
||||
to the server's HTTP path:
|
||||
```
|
||||
# copy the produced page to your HTTP path
|
||||
cp bin/bench.wasm/* /path/to/html/
|
||||
cp bin/libbench.worker.js /path/to/html/
|
||||
|
@ -24,8 +24,6 @@
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
<script src="../coi-serviceworker.js"></script>
|
||||
<link rel="icon" href="data:,">
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
@ -38,10 +36,11 @@
|
||||
<br><br>
|
||||
|
||||
<b>More examples:</b>
|
||||
<a href="../">main</a> |
|
||||
<a href="../bench.wasm/">bench</a> |
|
||||
<a href="../stream.wasm">stream</a> |
|
||||
<a href="../command.wasm/">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||
|
||||
<br><br>
|
||||
|
||||
|
@ -4,7 +4,7 @@ A very basic tool for benchmarking the inference performance on your device. The
|
||||
the transformer on some random audio data and records the execution time. This way we can have an objective comparison
|
||||
of the performance of the model for various setups.
|
||||
|
||||
Benchmark results are tracked in the following Github issue: https://github.com/ggml-org/whisper.cpp/issues/89
|
||||
Benchmark results are tracked in the following Github issue: https://github.com/ggerganov/whisper.cpp/issues/89
|
||||
|
||||
```bash
|
||||
# run the bench too on the small.en model using 4 threads
|
||||
@ -40,7 +40,7 @@ system_info: n_threads = 4 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1 | WA
|
||||
|
||||
If you wish, you can submit these results here:
|
||||
|
||||
https://github.com/ggml-org/whisper.cpp/issues/89
|
||||
https://github.com/ggerganov/whisper.cpp/issues/89
|
||||
|
||||
Please include the following information:
|
||||
|
||||
|
@ -13,9 +13,7 @@
|
||||
#include <cstring>
|
||||
|
||||
#if defined(_WIN32)
|
||||
#ifndef NOMINMAX
|
||||
#define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
|
@ -1,146 +0,0 @@
|
||||
/*! coi-serviceworker v0.1.7 - Guido Zuidhof and contributors, licensed under MIT */
|
||||
let coepCredentialless = false;
|
||||
if (typeof window === 'undefined') {
|
||||
self.addEventListener("install", () => self.skipWaiting());
|
||||
self.addEventListener("activate", (event) => event.waitUntil(self.clients.claim()));
|
||||
|
||||
self.addEventListener("message", (ev) => {
|
||||
if (!ev.data) {
|
||||
return;
|
||||
} else if (ev.data.type === "deregister") {
|
||||
self.registration
|
||||
.unregister()
|
||||
.then(() => {
|
||||
return self.clients.matchAll();
|
||||
})
|
||||
.then(clients => {
|
||||
clients.forEach((client) => client.navigate(client.url));
|
||||
});
|
||||
} else if (ev.data.type === "coepCredentialless") {
|
||||
coepCredentialless = ev.data.value;
|
||||
}
|
||||
});
|
||||
|
||||
self.addEventListener("fetch", function (event) {
|
||||
const r = event.request;
|
||||
if (r.cache === "only-if-cached" && r.mode !== "same-origin") {
|
||||
return;
|
||||
}
|
||||
|
||||
const request = (coepCredentialless && r.mode === "no-cors")
|
||||
? new Request(r, {
|
||||
credentials: "omit",
|
||||
})
|
||||
: r;
|
||||
event.respondWith(
|
||||
fetch(request)
|
||||
.then((response) => {
|
||||
if (response.status === 0) {
|
||||
return response;
|
||||
}
|
||||
|
||||
const newHeaders = new Headers(response.headers);
|
||||
newHeaders.set("Cross-Origin-Embedder-Policy",
|
||||
coepCredentialless ? "credentialless" : "require-corp"
|
||||
);
|
||||
if (!coepCredentialless) {
|
||||
newHeaders.set("Cross-Origin-Resource-Policy", "cross-origin");
|
||||
}
|
||||
newHeaders.set("Cross-Origin-Opener-Policy", "same-origin");
|
||||
|
||||
return new Response(response.body, {
|
||||
status: response.status,
|
||||
statusText: response.statusText,
|
||||
headers: newHeaders,
|
||||
});
|
||||
})
|
||||
.catch((e) => console.error(e))
|
||||
);
|
||||
});
|
||||
|
||||
} else {
|
||||
(() => {
|
||||
const reloadedBySelf = window.sessionStorage.getItem("coiReloadedBySelf");
|
||||
window.sessionStorage.removeItem("coiReloadedBySelf");
|
||||
const coepDegrading = (reloadedBySelf == "coepdegrade");
|
||||
|
||||
// You can customize the behavior of this script through a global `coi` variable.
|
||||
const coi = {
|
||||
shouldRegister: () => !reloadedBySelf,
|
||||
shouldDeregister: () => false,
|
||||
coepCredentialless: () => true,
|
||||
coepDegrade: () => true,
|
||||
doReload: () => window.location.reload(),
|
||||
quiet: false,
|
||||
...window.coi
|
||||
};
|
||||
|
||||
const n = navigator;
|
||||
const controlling = n.serviceWorker && n.serviceWorker.controller;
|
||||
|
||||
// Record the failure if the page is served by serviceWorker.
|
||||
if (controlling && !window.crossOriginIsolated) {
|
||||
window.sessionStorage.setItem("coiCoepHasFailed", "true");
|
||||
}
|
||||
const coepHasFailed = window.sessionStorage.getItem("coiCoepHasFailed");
|
||||
|
||||
if (controlling) {
|
||||
// Reload only on the first failure.
|
||||
const reloadToDegrade = coi.coepDegrade() && !(
|
||||
coepDegrading || window.crossOriginIsolated
|
||||
);
|
||||
n.serviceWorker.controller.postMessage({
|
||||
type: "coepCredentialless",
|
||||
value: (reloadToDegrade || coepHasFailed && coi.coepDegrade())
|
||||
? false
|
||||
: coi.coepCredentialless(),
|
||||
});
|
||||
if (reloadToDegrade) {
|
||||
!coi.quiet && console.log("Reloading page to degrade COEP.");
|
||||
window.sessionStorage.setItem("coiReloadedBySelf", "coepdegrade");
|
||||
coi.doReload("coepdegrade");
|
||||
}
|
||||
|
||||
if (coi.shouldDeregister()) {
|
||||
n.serviceWorker.controller.postMessage({ type: "deregister" });
|
||||
}
|
||||
}
|
||||
|
||||
// If we're already coi: do nothing. Perhaps it's due to this script doing its job, or COOP/COEP are
|
||||
// already set from the origin server. Also if the browser has no notion of crossOriginIsolated, just give up here.
|
||||
if (window.crossOriginIsolated !== false || !coi.shouldRegister()) return;
|
||||
|
||||
if (!window.isSecureContext) {
|
||||
!coi.quiet && console.log("COOP/COEP Service Worker not registered, a secure context is required.");
|
||||
return;
|
||||
}
|
||||
|
||||
// In some environments (e.g. Firefox private mode) this won't be available
|
||||
if (!n.serviceWorker) {
|
||||
!coi.quiet && console.error("COOP/COEP Service Worker not registered, perhaps due to private mode.");
|
||||
return;
|
||||
}
|
||||
|
||||
n.serviceWorker.register(window.document.currentScript.src).then(
|
||||
(registration) => {
|
||||
!coi.quiet && console.log("COOP/COEP Service Worker registered", registration.scope);
|
||||
|
||||
registration.addEventListener("updatefound", () => {
|
||||
!coi.quiet && console.log("Reloading page to make use of updated COOP/COEP Service Worker.");
|
||||
window.sessionStorage.setItem("coiReloadedBySelf", "updatefound");
|
||||
coi.doReload();
|
||||
});
|
||||
|
||||
// If the registration is active, but it's not controlling the page
|
||||
if (registration.active && !n.serviceWorker.controller) {
|
||||
!coi.quiet && console.log("Reloading page to make use of COOP/COEP Service Worker.");
|
||||
window.sessionStorage.setItem("coiReloadedBySelf", "notcontrolling");
|
||||
coi.doReload();
|
||||
}
|
||||
},
|
||||
(err) => {
|
||||
!coi.quiet && console.error("COOP/COEP Service Worker failed to register:", err);
|
||||
}
|
||||
);
|
||||
})();
|
||||
}
|
@ -3,7 +3,7 @@
|
||||
This is a basic Voice Assistant example that accepts voice commands from the microphone.
|
||||
It runs in fully in the browser via WebAseembly.
|
||||
|
||||
Online demo: https://ggerganov.github.io/whisper.cpp/command.wasm
|
||||
Online demo: https://whisper.ggerganov.com/command/
|
||||
|
||||
Terminal version: [examples/command](/examples/command)
|
||||
|
||||
@ -15,18 +15,9 @@ git clone https://github.com/ggerganov/whisper.cpp
|
||||
cd whisper.cpp
|
||||
mkdir build-em && cd build-em
|
||||
emcmake cmake ..
|
||||
make -j libcommand
|
||||
```
|
||||
The example can then be started by running a local HTTP server:
|
||||
```console
|
||||
python3 examples/server.py
|
||||
```
|
||||
And then opening a browser to the following URL:
|
||||
http://localhost:8000/command.wasm/
|
||||
make -j
|
||||
|
||||
To run the example in a different server, you need to copy the following files
|
||||
to the server's HTTP path:
|
||||
```
|
||||
# copy the produced page to your HTTP path
|
||||
cp bin/command.wasm/* /path/to/html/
|
||||
cp bin/libcommand.worker.js /path/to/html/
|
||||
```
|
||||
|
@ -24,8 +24,6 @@
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
<script src="../coi-serviceworker.js"></script>
|
||||
<link rel="icon" href="data:,">
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
@ -38,10 +36,11 @@
|
||||
<br><br>
|
||||
|
||||
<b>More examples:</b>
|
||||
<a href="../">main</a> |
|
||||
<a href="../bench.wasm/">bench</a> |
|
||||
<a href="../stream.wasm">stream</a> |
|
||||
<a href="../command.wasm/">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||
|
||||
<br><br>
|
||||
|
||||
|
@ -3,7 +3,7 @@
|
||||
// Speak short text commands to the microphone.
|
||||
// This program will detect your voice command and convert them to text.
|
||||
//
|
||||
// ref: https://github.com/ggml-org/whisper.cpp/issues/171
|
||||
// ref: https://github.com/ggerganov/whisper.cpp/issues/171
|
||||
//
|
||||
|
||||
#include "common-sdl.h"
|
||||
|
@ -247,6 +247,17 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
|
||||
return result;
|
||||
}
|
||||
|
||||
std::string convert_to_utf8(const std::wstring & input) {
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
return converter.to_bytes(input);
|
||||
}
|
||||
|
||||
|
||||
std::wstring convert_to_wstring(const std::string & input) {
|
||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
|
||||
return converter.from_bytes(input);
|
||||
}
|
||||
|
||||
void gpt_split_words(std::string str, std::vector<std::string>& words) {
|
||||
const std::string pattern = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
||||
const std::regex re(pattern);
|
||||
|
@ -1,6 +1,4 @@
|
||||
add_executable(main ./deprecation-warning.cpp)
|
||||
add_executable(bench ./deprecation-warning.cpp)
|
||||
if (WHISPER_SDL2)
|
||||
add_executable(stream ./deprecation-warning.cpp)
|
||||
add_executable(command ./deprecation-warning.cpp)
|
||||
endif()
|
||||
add_executable(stream ./deprecation-warning.cpp)
|
||||
add_executable(command ./deprecation-warning.cpp)
|
||||
|
@ -2,7 +2,7 @@
|
||||
#
|
||||
# Transcribe audio livestream by feeding ffmpeg output to whisper.cpp at regular intervals
|
||||
# Idea by @semiformal-net
|
||||
# ref: https://github.com/ggml-org/whisper.cpp/issues/185
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/185
|
||||
#
|
||||
|
||||
set -eo pipefail
|
||||
|
@ -1,115 +0,0 @@
|
||||
import http.server
|
||||
import socketserver
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import urllib.parse
|
||||
|
||||
# Directory that contains this script.
SCRIPT_DIR = Path(__file__).parent.absolute()

# Absolute, normalized path of the directory handed to the HTTP request handler.
DIRECTORY = os.path.abspath(os.path.join(SCRIPT_DIR, "../build-em/bin"))

# The context root we want for all applications
CONTEXT_ROOT = "/whisper.cpp"
|
||||
|
||||
class CustomHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
    """Serve the files below DIRECTORY under the CONTEXT_ROOT URL prefix.

    Request paths are rewritten in do_GET() before being handed to the
    standard SimpleHTTPRequestHandler, and every response gets the
    cross-origin headers added in end_headers().
    """

    # Sub-applications whose path below the context root is used unchanged.
    _PASSTHROUGH_PREFIXES = ('/bench.wasm/', '/command.wasm/', '/stream.wasm/')

    def __init__(self, *args, **kwargs):
        # Always serve from DIRECTORY, regardless of the working directory.
        super().__init__(*args, directory=DIRECTORY, **kwargs)

    def _redirect_to_context_root(self):
        """Answer with a 302 redirect to CONTEXT_ROOT + '/'."""
        self.send_response(302)
        self.send_header('Location', CONTEXT_ROOT + '/')
        self.end_headers()

    def _map_context_path(self, relative):
        """Rewrite self.path for a non-empty path below the context root.

        `relative` is the request path with the CONTEXT_ROOT prefix removed.
        self.path is left untouched when no matching file is found.
        """
        if '.worker.js' in relative:
            # Worker scripts are looked up by file name directly in DIRECTORY.
            name = os.path.basename(relative)
            location = os.path.join(DIRECTORY, name)
            if os.path.exists(location):
                print(f"Found worker file: {location}")
                self.path = '/' + name
            else:
                print(f"Worker file not found: {location}")
            return

        if relative == '/':
            # The context root itself maps to the whisper.wasm index page.
            self.path = '/whisper.wasm/index.html'
            return

        if relative.startswith(self._PASSTHROUGH_PREFIXES):
            # Keep the path as is, just without the context root.
            self.path = relative
            return

        # Any other path: prefer a regular file inside whisper.wasm, then fall
        # back to the same path relative to DIRECTORY.
        stripped = relative.lstrip('/')
        in_main_app = os.path.join(DIRECTORY, 'whisper.wasm', stripped)
        if os.path.exists(in_main_app) and not os.path.isdir(in_main_app):
            self.path = '/whisper.wasm' + relative
        elif os.path.exists(os.path.join(DIRECTORY, stripped)):
            self.path = relative

    def do_GET(self):
        """Map the requested URL onto the served directory, then serve it."""
        url = self.path
        below_root = None
        if url.startswith(CONTEXT_ROOT):
            # Remove the context root prefix to get the actual path.
            below_root = url[len(CONTEXT_ROOT):]

        # Both '/' and the bare context root redirect to CONTEXT_ROOT + '/'.
        if url == '/' or below_root == '':
            self._redirect_to_context_root()
            return

        if below_root is not None:
            self._map_context_path(below_root)
        elif '.worker.js' in url:
            # Direct requests to worker files (without the context root); these
            # are looked up in DIRECTORY.
            name = os.path.basename(url)
            if os.path.exists(os.path.join(DIRECTORY, name)):
                self.path = '/' + name

        # coi-serviceworker.js is read from the script directory, not DIRECTORY.
        if 'coi-serviceworker.js' in self.path:
            script = os.path.join(SCRIPT_DIR, "coi-serviceworker.js")
            if not os.path.exists(script):
                print(f"Warning: Could not find {script}")
            else:
                self.send_response(200)
                self.send_header('Content-type', 'application/javascript')
                self.end_headers()
                with open(script, 'rb') as file:
                    self.wfile.write(file.read())
                return

        return super().do_GET()

    def end_headers(self):
        """Append the headers required for SharedArrayBuffer to each response."""
        for header, value in (
            ("Cross-Origin-Opener-Policy", "same-origin"),
            ("Cross-Origin-Embedder-Policy", "require-corp"),
            ("Access-Control-Allow-Origin", "*"),
        ):
            self.send_header(header, value)
        super().end_headers()
|
||||
|
||||
# TCP port the server listens on.
PORT = 8000


# TCPServer with address reuse enabled.
class CustomServer(socketserver.TCPServer):
    allow_reuse_address = True


try:
    # An empty host string is passed as the bind address.
    with CustomServer(("", PORT), CustomHTTPRequestHandler) as httpd:
        print(f"Serving directory '{DIRECTORY}' at http://localhost:{PORT}")
        print(f"Application context root: http://localhost:{PORT}{CONTEXT_ROOT}/")
        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            # Ctrl+C: report it and force a complete exit with status 0.
            print("\nServer stopped.")
            sys.exit(0)
except OSError as err:
    # Report the OS-level failure and exit with a non-zero status.
    print(f"Error: {err}")
    sys.exit(1)
|
@ -1024,11 +1024,6 @@ int main(int argc, char ** argv) {
|
||||
// check if the model is in the file system
|
||||
});
|
||||
|
||||
svr.Get(sparams.request_path + "/health", [&](const Request &, Response &res){
|
||||
const std::string health_response = "{\"status\":\"ok\"}";
|
||||
res.set_content(health_response, "application/json");
|
||||
});
|
||||
|
||||
svr.set_exception_handler([](const Request &, Response &res, std::exception_ptr ep) {
|
||||
const char fmt[] = "500 Internal Server Error\n%s";
|
||||
char buf[BUFSIZ];
|
||||
|
@ -13,17 +13,7 @@ cd whisper.cpp
|
||||
mkdir build-em && cd build-em
|
||||
emcmake cmake ..
|
||||
make -j
|
||||
```
|
||||
The example can then be started by running a local HTTP server:
|
||||
```console
|
||||
python3 examples/server.py
|
||||
```
|
||||
And then opening a browser to the following URL:
|
||||
http://localhost:8000/stream.wasm
|
||||
|
||||
To run the example on a different server, you need to copy the following files
|
||||
to the server's HTTP path:
|
||||
```
|
||||
# copy the produced page to your HTTP path
|
||||
cp bin/stream.wasm/* /path/to/html/
|
||||
cp bin/libstream.worker.js /path/to/html/
|
||||
|
@ -24,8 +24,6 @@
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
<script src="../coi-serviceworker.js"></script>
|
||||
<link rel="icon" href="data:,">
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
@ -38,10 +36,11 @@
|
||||
<br><br>
|
||||
|
||||
<b>More examples:</b>
|
||||
<a href="../">main</a> |
|
||||
<a href="../bench.wasm/">bench</a> |
|
||||
<a href="../stream.wasm">stream</a> |
|
||||
<a href="../command.wasm/">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||
|
||||
<br><br>
|
||||
|
||||
|
@ -2,7 +2,7 @@
|
||||
#
|
||||
# Transcribe twitch.tv livestream by feeding audio input to whisper.cpp at regular intervals
|
||||
# Thanks to @keyehzy
|
||||
# ref: https://github.com/ggml-org/whisper.cpp/issues/209
|
||||
# ref: https://github.com/ggerganov/whisper.cpp/issues/209
|
||||
#
|
||||
# The script currently depends on the third-party tool "streamlink"
|
||||
# On Mac OS, you can install it via "brew install streamlink"
|
||||
|
@ -2,25 +2,15 @@ cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
project(whisper.cpp)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 17)
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
|
||||
|
||||
set(SOURCE_FILES
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/unary-ops.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/binary-ops.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/vec.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ops.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-aarch64.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-alloc.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-backend.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-backend-reg.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-quants.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-threading.cpp
|
||||
${WHISPER_LIB_DIR}/src/whisper.cpp
|
||||
${CMAKE_SOURCE_DIR}/jni.c
|
||||
)
|
||||
@ -35,7 +25,6 @@ function(build_library target_name)
|
||||
)
|
||||
|
||||
target_link_libraries(${target_name} ${LOG_LIB} android)
|
||||
target_compile_definitions(${target_name} PUBLIC GGML_USE_CPU)
|
||||
|
||||
if (${target_name} STREQUAL "whisper_v8fp16_va")
|
||||
target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
|
||||
@ -68,4 +57,3 @@ include_directories(${WHISPER_LIB_DIR}/src)
|
||||
include_directories(${WHISPER_LIB_DIR}/include)
|
||||
include_directories(${WHISPER_LIB_DIR}/ggml/include)
|
||||
include_directories(${WHISPER_LIB_DIR}/ggml/src)
|
||||
include_directories(${WHISPER_LIB_DIR}/ggml/src/ggml-cpu)
|
||||
|
@ -16,10 +16,9 @@ allprojects {
|
||||
repositories {
|
||||
google()
|
||||
jcenter()
|
||||
maven { url "https://maven.aliyun.com/repository/gradle-plugin" }
|
||||
}
|
||||
}
|
||||
|
||||
task clean(type: Delete) {
|
||||
delete rootProject.buildDir
|
||||
}
|
||||
}
|
0
examples/whisper.android.java/gradlew
vendored
Executable file → Normal file
0
examples/whisper.android.java/gradlew
vendored
Executable file → Normal file
@ -32,10 +32,6 @@ if (NOT GGML_HOME)
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-hbm.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-quants.c
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ggml-cpu-traits.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/unary-ops.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/binary-ops.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/vec.cpp
|
||||
${WHISPER_LIB_DIR}/ggml/src/ggml-cpu/ops.cpp
|
||||
)
|
||||
endif()
|
||||
|
||||
@ -48,8 +44,6 @@ function(build_library target_name)
|
||||
${SOURCE_FILES}
|
||||
)
|
||||
|
||||
target_compile_definitions(${target_name} PUBLIC GGML_USE_CPU)
|
||||
|
||||
if (${target_name} STREQUAL "whisper_v8fp16_va")
|
||||
target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
|
||||
set(GGML_COMPILE_OPTIONS -march=armv8.2-a+fp16)
|
||||
|
@ -5,7 +5,7 @@
|
||||
# This simple script is called by Neovim to capture audio from the microphone and transcribe it with Whisper.
|
||||
# In order for this to work, you need to clone the whisper.cpp repo and build the 'stream' tool
|
||||
#
|
||||
# git clone https://github.com/ggml-org/whisper.cpp
|
||||
# git clone https://github.com/ggerganov/whisper.cpp
|
||||
# cd whisper.cpp
|
||||
# make stream
|
||||
#
|
||||
@ -31,7 +31,7 @@
|
||||
model="base.en"
|
||||
|
||||
# export the path to the whisper.cpp repo in the WHISPER_CPP_HOME env variable
|
||||
# https://github.com/ggml-org/whisper.cpp
|
||||
# https://github.com/ggerganov/whisper.cpp
|
||||
cd "${WHISPER_CPP_HOME}"
|
||||
|
||||
if [ ! -f ./stream ] ; then
|
||||
|
@ -11,25 +11,39 @@ https://user-images.githubusercontent.com/1991296/204126266-ce4177c6-6eca-4bd9-b
|
||||
|
||||
## Usage
|
||||
|
||||
This example uses the whisper.xcframework which needs to be built first using the following command:
|
||||
```bash
|
||||
./build-xcframework.sh
|
||||
```
|
||||
git clone https://github.com/ggerganov/whisper.cpp
|
||||
open whisper.cpp/examples/whisper.objc/whisper.objc.xcodeproj/
|
||||
|
||||
A model also needs to be downloaded, which can be done using the following command:
|
||||
```bash
|
||||
./models/download-ggml-model.sh base.en
|
||||
```
|
||||
|
||||
If you don't want to convert a Core ML model, you can skip this step by creating a dummy model:
|
||||
```bash
|
||||
# if you don't want to convert a Core ML model, you can skip this step by creating a dummy model
|
||||
mkdir models/ggml-base.en-encoder.mlmodelc
|
||||
```
|
||||
|
||||
Make sure to build the project in `Release`:
|
||||
|
||||
<img width="947" alt="image" src="https://user-images.githubusercontent.com/1991296/197382607-9e1e6d1b-79fa-496f-9d16-b71dc1535701.png">
|
||||
|
||||
Also, don't forget to add the `-DGGML_USE_ACCELERATE` compiler flag for `ggml.c` in Build Phases.
|
||||
This can significantly improve the performance of the transcription:
|
||||
|
||||
<img width="1072" alt="image" src="https://user-images.githubusercontent.com/1991296/208511239-8d7cdbd1-aa48-41b5-becd-ca288d53cc07.png">
|
||||
|
||||
## Core ML
|
||||
|
||||
Follow the [`Core ML support` section of readme](../../README.md#core-ml-support) to convert the model.
|
||||
That is all that needs to be done to use the Core ML model in the app. The converted model is a
|
||||
resource in the project and will be used if it is available. Note that the Core ML model is only
|
||||
used for the encoder; the decoder, which is in the ggml model, is still required, so both need to
|
||||
be available.
|
||||
If you want to enable Core ML support, you can add the `-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK` compiler flags for `whisper.cpp` in Build Phases:
|
||||
|
||||
<img width="1072" alt="image" src="https://github.com/ggerganov/whisper.cpp/assets/3001525/103e8f57-6eb6-490d-a60c-f6cf6c319324">
|
||||
|
||||
Then follow the [`Core ML support` section of readme](../../README.md#core-ml-support) to convert the model.
|
||||
|
||||
This project also adds `-O3 -DNDEBUG` to `Other C Flags`. Adding flags at the app-project level is not ideal in real-world use because they apply to all C/C++ files; consider splitting the xcodeproj into a workspace in your own project.
|
||||
|
||||
## Metal
|
||||
|
||||
You can also enable Metal to make the inference run on the GPU of your device. This might or might not be more efficient
|
||||
compared to Core ML depending on the model and device that you use.
|
||||
|
||||
To enable Metal, just add `-DGGML_USE_METAL` instead of the `-DWHISPER_USE_COREML` flag and you are ready.
|
||||
This will make both the Encoder and the Decoder run on the GPU.
|
||||
|
||||
If you want to run the Encoder with Core ML and the Decoder with Metal then simply add both `-DWHISPER_USE_COREML -DGGML_USE_METAL` flags. That's all!
|
||||
|
@ -7,6 +7,7 @@
|
||||
objects = {
|
||||
|
||||
/* Begin PBXBuildFile section */
|
||||
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */ = {isa = PBXBuildFile; fileRef = 184447182AB211A2007D6BFE /* ggml-alloc.c */; };
|
||||
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7A29052BDF00BD2A04 /* AppDelegate.m */; };
|
||||
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C7D29052BDF00BD2A04 /* SceneDelegate.m */; };
|
||||
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8029052BDF00BD2A04 /* ViewController.m */; };
|
||||
@ -14,12 +15,23 @@
|
||||
18627C8629052BE000BD2A04 /* Assets.xcassets in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8529052BE000BD2A04 /* Assets.xcassets */; };
|
||||
18627C8929052BE000BD2A04 /* LaunchScreen.storyboard in Resources */ = {isa = PBXBuildFile; fileRef = 18627C8729052BE000BD2A04 /* LaunchScreen.storyboard */; };
|
||||
18627C8C29052BE000BD2A04 /* main.m in Sources */ = {isa = PBXBuildFile; fileRef = 18627C8B29052BE000BD2A04 /* main.m */; };
|
||||
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9329052C4900BD2A04 /* whisper.cpp */; settings = {COMPILER_FLAGS = "-DWHISPER_USE_COREML -DWHISPER_COREML_ALLOW_FALLBACK -DGGML_USE_METAL"; }; };
|
||||
18627C9629052C5800BD2A04 /* ggml.c in Sources */ = {isa = PBXBuildFile; fileRef = 18627C9529052C5800BD2A04 /* ggml.c */; settings = {COMPILER_FLAGS = "-DGGML_USE_ACCELERATE -DGGML_USE_METAL"; }; };
|
||||
18627C9B29052CFF00BD2A04 /* ggml-base.en.bin in Resources */ = {isa = PBXBuildFile; fileRef = 18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */; };
|
||||
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1572AF556340044A204 /* ggml-backend.cpp */; };
|
||||
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18ABE1592AF556340044A204 /* ggml-quants.c */; };
|
||||
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */ = {isa = PBXBuildFile; fileRef = 18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */; };
|
||||
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */; };
|
||||
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */; };
|
||||
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */; settings = {COMPILER_FLAGS = "-x c++"; }; };
|
||||
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */; };
|
||||
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */; };
|
||||
433188B82D3A187C00E3FE79 /* gguf.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 433188B72D3A187C00E3FE79 /* gguf.cpp */; };
|
||||
437B63E22D36280C002A49EC /* ggml-cpu-traits.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 437B63E12D36280C002A49EC /* ggml-cpu-traits.cpp */; };
|
||||
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */; };
|
||||
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */ = {isa = PBXBuildFile; fileRef = 7FE342472A0C3FA20015A058 /* whisper-encoder.mm */; };
|
||||
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */ = {isa = PBXBuildFile; fileRef = 7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */; };
|
||||
7FE3424F2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc in Resources */ = {isa = PBXBuildFile; fileRef = 7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */; };
|
||||
DDE3609F2D87EA8C004EA223 /* whisper.xcframework in Frameworks */ = {isa = PBXBuildFile; fileRef = DDE3609E2D87EA8C004EA223 /* whisper.xcframework */; };
|
||||
DDE360A02D87EA8C004EA223 /* whisper.xcframework in Embed Frameworks */ = {isa = PBXBuildFile; fileRef = DDE3609E2D87EA8C004EA223 /* whisper.xcframework */; settings = {ATTRIBUTES = (CodeSignOnCopy, RemoveHeadersOnCopy, ); }; };
|
||||
/* End PBXBuildFile section */
|
||||
|
||||
/* Begin PBXCopyFilesBuildPhase section */
|
||||
@ -33,20 +45,11 @@
|
||||
name = "Copy Files";
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
DDE360A12D87EA8C004EA223 /* Embed Frameworks */ = {
|
||||
isa = PBXCopyFilesBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
dstPath = "";
|
||||
dstSubfolderSpec = 10;
|
||||
files = (
|
||||
DDE360A02D87EA8C004EA223 /* whisper.xcframework in Embed Frameworks */,
|
||||
);
|
||||
name = "Embed Frameworks";
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
/* End PBXCopyFilesBuildPhase section */
|
||||
|
||||
/* Begin PBXFileReference section */
|
||||
184447182AB211A2007D6BFE /* ggml-alloc.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-alloc.c"; path = "../../../ggml/src/ggml-alloc.c"; sourceTree = "<group>"; };
|
||||
184447192AB211A2007D6BFE /* ggml-alloc.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-alloc.h"; path = "../../../ggml/include/ggml-alloc.h"; sourceTree = "<group>"; };
|
||||
18627C7629052BDF00BD2A04 /* whisper.objc.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = whisper.objc.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
18627C7929052BDF00BD2A04 /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = "<group>"; };
|
||||
18627C7A29052BDF00BD2A04 /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = "<group>"; };
|
||||
@ -59,7 +62,34 @@
|
||||
18627C8829052BE000BD2A04 /* Base */ = {isa = PBXFileReference; lastKnownFileType = file.storyboard; name = Base; path = Base.lproj/LaunchScreen.storyboard; sourceTree = "<group>"; };
|
||||
18627C8A29052BE000BD2A04 /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
|
||||
18627C8B29052BE000BD2A04 /* main.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = main.m; sourceTree = "<group>"; };
|
||||
18627C9229052C2B00BD2A04 /* whisper.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = whisper.h; path = ../../../include/whisper.h; sourceTree = "<group>"; };
|
||||
18627C9329052C4900BD2A04 /* whisper.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = whisper.cpp; path = ../../../src/whisper.cpp; sourceTree = "<group>"; };
|
||||
18627C9529052C5800BD2A04 /* ggml.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = ggml.c; path = ../../../ggml/src/ggml.c; sourceTree = "<group>"; };
|
||||
18627C9729052C6600BD2A04 /* ggml.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = ggml.h; path = ../../../ggml/include/ggml.h; sourceTree = "<group>"; };
|
||||
18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */ = {isa = PBXFileReference; lastKnownFileType = archive.macbinary; name = "ggml-base.en.bin"; path = "../../../models/ggml-base.en.bin"; sourceTree = "<group>"; };
|
||||
18A275FF2C2A9563001C8D37 /* ggml-common.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-common.h"; path = "../../../ggml/src/ggml-common.h"; sourceTree = "<group>"; };
|
||||
18ABE1542AF556340044A204 /* ggml-quants.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-quants.h"; path = "../../../ggml/src/ggml-quants.h"; sourceTree = "<group>"; };
|
||||
18ABE1552AF556340044A204 /* ggml-backend.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend.h"; path = "../../../ggml/include/ggml-backend.h"; sourceTree = "<group>"; };
|
||||
18ABE1562AF556340044A204 /* ggml-backend-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-backend-impl.h"; path = "../../../ggml/src/ggml-backend-impl.h"; sourceTree = "<group>"; };
|
||||
18ABE1572AF556340044A204 /* ggml-backend.cpp */ = {isa = PBXFileReference; explicitFileType = sourcecode.cpp.cpp; fileEncoding = 4; name = "ggml-backend.cpp"; path = "../../../ggml/src/ggml-backend.cpp"; sourceTree = "<group>"; };
|
||||
18ABE1582AF556340044A204 /* ggml-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = "ggml-impl.h"; path = "../../../ggml/src/ggml-impl.h"; sourceTree = "<group>"; };
|
||||
18ABE1592AF556340044A204 /* ggml-quants.c */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.c; name = "ggml-quants.c"; path = "../../../ggml/src/ggml-quants.c"; sourceTree = "<group>"; };
|
||||
18B07DCB2D70411100B3B87C /* ggml-cpp.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpp.h"; path = "../../../ggml/include/ggml-cpp.h"; sourceTree = "<group>"; };
|
||||
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.c"; sourceTree = "<group>"; };
|
||||
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu.h"; path = "../../../ggml/include/ggml-cpu.h"; sourceTree = "<group>"; };
|
||||
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-threading.h"; path = "../../../ggml/src/ggml-threading.h"; sourceTree = "<group>"; };
|
||||
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-threading.cpp"; path = "../../../ggml/src/ggml-threading.cpp"; sourceTree = "<group>"; };
|
||||
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu.cpp"; sourceTree = "<group>"; };
|
||||
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-aarch64.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.h"; sourceTree = "<group>"; };
|
||||
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-aarch64.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-aarch64.cpp"; sourceTree = "<group>"; };
|
||||
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-impl.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-impl.h"; sourceTree = "<group>"; };
|
||||
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-quants.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.h"; sourceTree = "<group>"; };
|
||||
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.c; name = "ggml-cpu-quants.c"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-quants.c"; sourceTree = "<group>"; };
|
||||
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-backend-reg.cpp"; path = "../../../ggml/src/ggml-backend-reg.cpp"; sourceTree = "<group>"; };
|
||||
433188B72D3A187C00E3FE79 /* gguf.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = gguf.cpp; path = ../../../ggml/src/gguf.cpp; sourceTree = "<group>"; };
|
||||
433188B92D3A18A400E3FE79 /* gguf.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = gguf.h; path = ../../../ggml/include/gguf.h; sourceTree = "<group>"; };
|
||||
437B63E02D36280C002A49EC /* ggml-cpu-traits.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; name = "ggml-cpu-traits.h"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-traits.h"; sourceTree = "<group>"; };
|
||||
437B63E12D36280C002A49EC /* ggml-cpu-traits.cpp */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.cpp.cpp; name = "ggml-cpu-traits.cpp"; path = "../../../ggml/src/ggml-cpu/ggml-cpu-traits.cpp"; sourceTree = "<group>"; };
|
||||
7FE342452A0C3FA20015A058 /* whisper-encoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-encoder-impl.m"; sourceTree = "<group>"; };
|
||||
7FE342462A0C3FA20015A058 /* whisper-encoder.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder.h"; sourceTree = "<group>"; };
|
||||
7FE342472A0C3FA20015A058 /* whisper-encoder.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = "whisper-encoder.mm"; sourceTree = "<group>"; };
|
||||
@ -67,7 +97,6 @@
|
||||
7FE342492A0C3FA20015A058 /* whisper-encoder-impl.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "whisper-encoder-impl.h"; sourceTree = "<group>"; };
|
||||
7FE3424A2A0C3FA20015A058 /* whisper-decoder-impl.m */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.objc; path = "whisper-decoder-impl.m"; sourceTree = "<group>"; };
|
||||
7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */ = {isa = PBXFileReference; lastKnownFileType = wrapper; name = "ggml-base.en-encoder.mlmodelc"; path = "../../../models/ggml-base.en-encoder.mlmodelc"; sourceTree = "<group>"; };
|
||||
DDE3609E2D87EA8C004EA223 /* whisper.xcframework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.xcframework; name = whisper.xcframework; path = "../../build-apple/whisper.xcframework"; sourceTree = "<group>"; };
|
||||
/* End PBXFileReference section */
|
||||
|
||||
/* Begin PBXFrameworksBuildPhase section */
|
||||
@ -75,7 +104,6 @@
|
||||
isa = PBXFrameworksBuildPhase;
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
DDE3609F2D87EA8C004EA223 /* whisper.xcframework in Frameworks */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
};
|
||||
@ -86,7 +114,6 @@
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
18627C7829052BDF00BD2A04 /* whisper.objc */,
|
||||
DDE3609D2D87EA8C004EA223 /* Frameworks */,
|
||||
18627C7729052BDF00BD2A04 /* Products */,
|
||||
);
|
||||
sourceTree = "<group>";
|
||||
@ -102,9 +129,38 @@
|
||||
18627C7829052BDF00BD2A04 /* whisper.objc */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
18B07DCB2D70411100B3B87C /* ggml-cpp.h */,
|
||||
433188B92D3A18A400E3FE79 /* gguf.h */,
|
||||
433188B72D3A187C00E3FE79 /* gguf.cpp */,
|
||||
18F8C0C62CEDF7AB00CAD607 /* ggml-backend-reg.cpp */,
|
||||
18F8C0BF2CEDF52700CAD607 /* ggml-cpu-aarch64.h */,
|
||||
18F8C0C02CEDF52700CAD607 /* ggml-cpu-aarch64.cpp */,
|
||||
18F8C0C12CEDF52700CAD607 /* ggml-cpu-impl.h */,
|
||||
437B63E02D36280C002A49EC /* ggml-cpu-traits.h */,
|
||||
437B63E12D36280C002A49EC /* ggml-cpu-traits.cpp */,
|
||||
18F8C0C22CEDF52700CAD607 /* ggml-cpu-quants.h */,
|
||||
18F8C0C32CEDF52700CAD607 /* ggml-cpu-quants.c */,
|
||||
18F8C0BD2CEDF50700CAD607 /* ggml-cpu.cpp */,
|
||||
18F8C0BA2CEDF4DC00CAD607 /* ggml-threading.h */,
|
||||
18F8C0BB2CEDF4DC00CAD607 /* ggml-threading.cpp */,
|
||||
18E864AA2CE73C580094B8B3 /* ggml-cpu.h */,
|
||||
18E864A82CE73C1E0094B8B3 /* ggml-cpu.c */,
|
||||
18A275FF2C2A9563001C8D37 /* ggml-common.h */,
|
||||
18ABE1562AF556340044A204 /* ggml-backend-impl.h */,
|
||||
18ABE1572AF556340044A204 /* ggml-backend.cpp */,
|
||||
18ABE1552AF556340044A204 /* ggml-backend.h */,
|
||||
18ABE1582AF556340044A204 /* ggml-impl.h */,
|
||||
18ABE1592AF556340044A204 /* ggml-quants.c */,
|
||||
18ABE1542AF556340044A204 /* ggml-quants.h */,
|
||||
184447182AB211A2007D6BFE /* ggml-alloc.c */,
|
||||
184447192AB211A2007D6BFE /* ggml-alloc.h */,
|
||||
7FE3424E2A0C418A0015A058 /* ggml-base.en-encoder.mlmodelc */,
|
||||
7FE342442A0C3FA20015A058 /* coreml */,
|
||||
18627C9A29052CFF00BD2A04 /* ggml-base.en.bin */,
|
||||
18627C9729052C6600BD2A04 /* ggml.h */,
|
||||
18627C9529052C5800BD2A04 /* ggml.c */,
|
||||
18627C9329052C4900BD2A04 /* whisper.cpp */,
|
||||
18627C9229052C2B00BD2A04 /* whisper.h */,
|
||||
18627C7929052BDF00BD2A04 /* AppDelegate.h */,
|
||||
18627C7A29052BDF00BD2A04 /* AppDelegate.m */,
|
||||
18627C7C29052BDF00BD2A04 /* SceneDelegate.h */,
|
||||
@ -134,14 +190,6 @@
|
||||
path = ../../../src/coreml;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
DDE3609D2D87EA8C004EA223 /* Frameworks */ = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
DDE3609E2D87EA8C004EA223 /* whisper.xcframework */,
|
||||
);
|
||||
name = Frameworks;
|
||||
sourceTree = "<group>";
|
||||
};
|
||||
/* End PBXGroup section */
|
||||
|
||||
/* Begin PBXNativeTarget section */
|
||||
@ -153,7 +201,6 @@
|
||||
18627C7329052BDF00BD2A04 /* Frameworks */,
|
||||
18627C7429052BDF00BD2A04 /* Resources */,
|
||||
184447202AB21B25007D6BFE /* Copy Files */,
|
||||
DDE360A12D87EA8C004EA223 /* Embed Frameworks */,
|
||||
);
|
||||
buildRules = (
|
||||
);
|
||||
@ -217,10 +264,24 @@
|
||||
buildActionMask = 2147483647;
|
||||
files = (
|
||||
18627C8129052BDF00BD2A04 /* ViewController.m in Sources */,
|
||||
18ABE15B2AF556340044A204 /* ggml-quants.c in Sources */,
|
||||
7FE3424C2A0C3FA20015A058 /* whisper-encoder.mm in Sources */,
|
||||
18627C9429052C4900BD2A04 /* whisper.cpp in Sources */,
|
||||
437B63E22D36280C002A49EC /* ggml-cpu-traits.cpp in Sources */,
|
||||
18627C9629052C5800BD2A04 /* ggml.c in Sources */,
|
||||
18627C7B29052BDF00BD2A04 /* AppDelegate.m in Sources */,
|
||||
7FE3424D2A0C3FA20015A058 /* whisper-decoder-impl.m in Sources */,
|
||||
18F8C0C72CEDF7AB00CAD607 /* ggml-backend-reg.cpp in Sources */,
|
||||
18F8C0BE2CEDF50700CAD607 /* ggml-cpu.cpp in Sources */,
|
||||
1844471A2AB211A2007D6BFE /* ggml-alloc.c in Sources */,
|
||||
18F8C0C42CEDF52700CAD607 /* ggml-cpu-aarch64.cpp in Sources */,
|
||||
18F8C0C52CEDF52700CAD607 /* ggml-cpu-quants.c in Sources */,
|
||||
18E864A92CE73C1E0094B8B3 /* ggml-cpu.c in Sources */,
|
||||
18ABE15A2AF556340044A204 /* ggml-backend.cpp in Sources */,
|
||||
18627C8C29052BE000BD2A04 /* main.m in Sources */,
|
||||
18627C7E29052BDF00BD2A04 /* SceneDelegate.m in Sources */,
|
||||
433188B82D3A187C00E3FE79 /* gguf.cpp in Sources */,
|
||||
18F8C0BC2CEDF4DC00CAD607 /* ggml-threading.cpp in Sources */,
|
||||
7FE3424B2A0C3FA20015A058 /* whisper-encoder-impl.m in Sources */,
|
||||
);
|
||||
runOnlyForDeploymentPostprocessing = 0;
|
||||
@ -298,7 +359,7 @@
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
HEADER_SEARCH_PATHS = "";
|
||||
HEADER_SEARCH_PATHS = ../../../ggml/src/;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
MTL_ENABLE_DEBUG_INFO = INCLUDE_SOURCE;
|
||||
MTL_FAST_MATH = YES;
|
||||
@ -352,7 +413,7 @@
|
||||
GCC_WARN_UNINITIALIZED_AUTOS = YES_AGGRESSIVE;
|
||||
GCC_WARN_UNUSED_FUNCTION = YES;
|
||||
GCC_WARN_UNUSED_VARIABLE = YES;
|
||||
HEADER_SEARCH_PATHS = "";
|
||||
HEADER_SEARCH_PATHS = ../../../ggml/src/;
|
||||
IPHONEOS_DEPLOYMENT_TARGET = 16.0;
|
||||
MTL_ENABLE_DEBUG_INFO = NO;
|
||||
MTL_FAST_MATH = YES;
|
||||
@ -376,7 +437,7 @@
|
||||
DEVELOPMENT_TEAM = P8JZH34X63;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
HEADER_SEARCH_PATHS = "";
|
||||
HEADER_SEARCH_PATHS = ../../../ggml/src/;
|
||||
INFOPLIST_FILE = whisper.objc/Info.plist;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
|
||||
@ -389,12 +450,10 @@
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
MTL_HEADER_SEARCH_PATHS = "";
|
||||
OTHER_CFLAGS = "-DGGML_USE_CPU=ON";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
WARNING_CFLAGS = "-Wno-quoted-include-in-framework-header";
|
||||
};
|
||||
name = Debug;
|
||||
};
|
||||
@ -409,7 +468,7 @@
|
||||
DEVELOPMENT_TEAM = P8JZH34X63;
|
||||
GCC_WARN_64_TO_32_BIT_CONVERSION = NO;
|
||||
GENERATE_INFOPLIST_FILE = YES;
|
||||
HEADER_SEARCH_PATHS = "";
|
||||
HEADER_SEARCH_PATHS = ../../../ggml/src/;
|
||||
INFOPLIST_FILE = whisper.objc/Info.plist;
|
||||
INFOPLIST_KEY_UIApplicationSupportsIndirectInputEvents = YES;
|
||||
INFOPLIST_KEY_UILaunchStoryboardName = LaunchScreen;
|
||||
@ -422,12 +481,10 @@
|
||||
);
|
||||
MARKETING_VERSION = 1.0;
|
||||
MTL_HEADER_SEARCH_PATHS = "";
|
||||
OTHER_CFLAGS = "-DGGML_USE_CPU=ON";
|
||||
PRODUCT_BUNDLE_IDENTIFIER = "com.ggerganov.whisper-objc";
|
||||
PRODUCT_NAME = "$(TARGET_NAME)";
|
||||
SWIFT_EMIT_LOC_STRINGS = YES;
|
||||
TARGETED_DEVICE_FAMILY = "1,2";
|
||||
WARNING_CFLAGS = "-Wno-quoted-include-in-framework-header";
|
||||
};
|
||||
name = Release;
|
||||
};
|
||||
|
@ -6,8 +6,8 @@
|
||||
//
|
||||
|
||||
#import "ViewController.h"
|
||||
#import <whisper/whisper.h>
|
||||
|
||||
#import "whisper.h"
|
||||
|
||||
#define NUM_BYTES_PER_BUFFER 16*1024
|
||||
|
||||
@ -83,19 +83,6 @@ void AudioInputCallback(void * inUserData,
|
||||
stateInp.n_samples = 0;
|
||||
stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
|
||||
stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
|
||||
// Set up audio session
|
||||
NSError *error = nil;
|
||||
|
||||
[[AVAudioSession sharedInstance] setCategory:AVAudioSessionCategoryRecord error:&error];
|
||||
if (error) {
|
||||
NSLog(@"Error setting audio session category: %@", error);
|
||||
}
|
||||
|
||||
[[AVAudioSession sharedInstance] setActive:YES error:&error];
|
||||
if (error) {
|
||||
NSLog(@"Error activating audio session: %@", error);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
stateInp.isTranscribing = false;
|
||||
|
@ -33,21 +33,6 @@ sudo xcode-select -switch /Applications/Xcode.app/Contents/Developer
|
||||
|
||||
**Note:** Pay attention to the folder path: `whisper.swiftui.demo/Resources/models` is the appropriate directory for model resources, whereas `whisper.swiftui.demo/Models` contains the actual code.
|
||||
|
||||
### Core ML support
|
||||
1. Follow all the steps in the `Usage` section, including adding the ggml model file.
|
||||
The ggml model file is required as the Core ML model is only used for the encoder. The
|
||||
decoder which is in the ggml model is still required.
|
||||
2. Follow the [`Core ML support` section of readme](../../README.md#core-ml-support) to convert the
|
||||
model.
|
||||
3. Add the Core ML model (`models/ggml-base.en-encoder.mlmodelc/`) to `whisper.swiftui.demo/Resources/models` **via Xcode**.
|
||||
|
||||
When the example starts running you should now see that it is using the Core ML model:
|
||||
```console
|
||||
whisper_init_state: loading Core ML model from '/Library/Developer/CoreSimulator/Devices/25E8C27D-0253-4281-AF17-C3F2A4D1D8F4/data/Containers/Bundle/Application/3ADA7D59-7B9C-43B4-A7E1-A87183FC546A/whisper.swiftui.app/models/ggml-base.en-encoder.mlmodelc'
|
||||
whisper_init_state: first run on a device may take a while ...
|
||||
whisper_init_state: Core ML model loaded
|
||||
```
|
||||
|
||||
[^1]: I recommend the tiny, base or small models for running on an iOS device.
|
||||
|
||||
[^2]: The `Release` build can boost transcription performance. This project also adds `-O3 -DNDEBUG` to `Other C Flags`, but adding flags at the app-project level is not ideal in real-world use because they apply to all C/C++ files; in your own project, consider splitting the code into a separate xcodeproj within a workspace.
|
||||
|
@ -32,9 +32,8 @@ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
|
||||
--bind \
|
||||
-s USE_PTHREADS=1 \
|
||||
-s PTHREAD_POOL_SIZE_STRICT=0 \
|
||||
-s INITIAL_MEMORY=512MB \
|
||||
-s MAXIMUM_MEMORY=2000MB \
|
||||
-s ALLOW_MEMORY_GROWTH=1 \
|
||||
-s INITIAL_MEMORY=2000MB \
|
||||
-s TOTAL_MEMORY=2000MB \
|
||||
-s FORCE_FILESYSTEM=1 \
|
||||
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
|
||||
${EXTRA_FLAGS} \
|
||||
|
@ -22,7 +22,7 @@ audio is limited to 120 seconds.
|
||||
|
||||
## Live demo
|
||||
|
||||
Link: https://ggerganov.github.io/whisper.cpp/
|
||||
Link: https://whisper.ggerganov.com
|
||||
|
||||

|
||||
|
||||
@ -30,22 +30,12 @@ Link: https://ggerganov.github.io/whisper.cpp/
|
||||
|
||||
```bash (v3.1.2)
|
||||
# build using Emscripten
|
||||
git clone https://github.com/ggml-org/whisper.cpp
|
||||
git clone https://github.com/ggerganov/whisper.cpp
|
||||
cd whisper.cpp
|
||||
mkdir build-em && cd build-em
|
||||
emcmake cmake ..
|
||||
make -j
|
||||
```
|
||||
The example can then be started by running a local HTTP server:
|
||||
```console
|
||||
python3 examples/server.py
|
||||
```
|
||||
And then opening a browser to the following URL:
|
||||
http://localhost:8000/whisper.wasm
|
||||
|
||||
To run the example on a different server, you need to copy the following files
|
||||
to the server's HTTP path:
|
||||
```
|
||||
# copy the produced page to your HTTP path
|
||||
cp bin/whisper.wasm/* /path/to/html/
|
||||
cp bin/libmain.worker.js /path/to/html/
|
||||
|
@ -65,14 +65,13 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
||||
}
|
||||
|
||||
struct whisper_full_params params = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
|
||||
bool is_multilingual = whisper_is_multilingual(g_contexts[index]);
|
||||
|
||||
params.print_realtime = true;
|
||||
params.print_progress = false;
|
||||
params.print_timestamps = true;
|
||||
params.print_special = false;
|
||||
params.translate = translate;
|
||||
params.language = is_multilingual ? strdup(lang.c_str()) : "en";
|
||||
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
|
||||
params.n_threads = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
|
||||
params.offset_ms = 0;
|
||||
|
||||
@ -103,13 +102,10 @@ EMSCRIPTEN_BINDINGS(whisper) {
|
||||
|
||||
// run the worker
|
||||
{
|
||||
g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32), is_multilingual]() {
|
||||
g_worker = std::thread([index, params, pcmf32 = std::move(pcmf32)]() {
|
||||
whisper_reset_timings(g_contexts[index]);
|
||||
whisper_full(g_contexts[index], params, pcmf32.data(), pcmf32.size());
|
||||
whisper_print_timings(g_contexts[index]);
|
||||
if (is_multilingual) {
|
||||
free((void*)params.language);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
@ -24,8 +24,6 @@
|
||||
overflow-x: scroll;
|
||||
}
|
||||
</style>
|
||||
<script src="coi-serviceworker.js"></script>
|
||||
<link rel="icon" href="data:,">
|
||||
</head>
|
||||
<body>
|
||||
<div id="main-container">
|
||||
@ -49,9 +47,11 @@
|
||||
</ul>
|
||||
|
||||
<b>More examples:</b>
|
||||
<a href="bench.wasm/">bench</a> |
|
||||
<a href="stream.wasm">stream</a> |
|
||||
<a href="command.wasm/">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/">main</a> |
|
||||
<a href="https://whisper.ggerganov.com/bench">bench</a> |
|
||||
<a href="https://whisper.ggerganov.com/stream">stream</a> |
|
||||
<a href="https://whisper.ggerganov.com/command">command</a> |
|
||||
<a href="https://whisper.ggerganov.com/talk">talk</a> |
|
||||
|
||||
<hr>
|
||||
|
||||
@ -614,7 +614,7 @@
|
||||
var nthreads = 8;
|
||||
|
||||
// Store the requested thread count and show it in the 'threads-value' element.
//
// `value` is parsed as a base-10 integer so that `nthreads` always holds a
// number rather than the raw argument.
function changeThreads(value) {
    nthreads = parseInt(value, 10);
    document.getElementById('threads-value').innerHTML = nthreads;
}
|
||||
|
||||
|
@ -25,12 +25,12 @@
|
||||
# SOFTWARE.
|
||||
|
||||
# Small shell script that makes it easier to automatically download and transcribe live stream VODs.
|
||||
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggml-org/whisper.cpp
|
||||
# This uses YT-DLP, ffmpeg and the CPP version of Whisper: https://github.com/ggerganov/whisper.cpp
|
||||
# Use `./examples/yt-wsp.sh help` to print help info.
|
||||
#
|
||||
# Sample usage:
|
||||
#
|
||||
# git clone https://github.com/ggml-org/whisper.cpp
|
||||
# git clone https://github.com/ggerganov/whisper.cpp
|
||||
# cd whisper.cpp
|
||||
# make
|
||||
# ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890
|
||||
@ -44,7 +44,7 @@ SCRIPT_DIR="${SCRIPT_PATH%/*}"
|
||||
|
||||
################################################################################
|
||||
# Documentation on downloading models can be found in the whisper.cpp repo:
|
||||
# https://github.com/ggml-org/whisper.cpp/#usage
|
||||
# https://github.com/ggerganov/whisper.cpp/#usage
|
||||
#
|
||||
# note: unless a multilingual model is specified, WHISPER_LANG will be ignored
|
||||
# and the video will be transcribed as if the audio were in the English language
|
||||
@ -103,10 +103,10 @@ check_requirements() {
|
||||
fi;
|
||||
|
||||
if ! command -v "${WHISPER_EXECUTABLE}" &>/dev/null; then
|
||||
echo "The C++ implementation of Whisper is required: https://github.com/ggml-org/whisper.cpp"
|
||||
echo "The C++ implementation of Whisper is required: https://github.com/ggerganov/whisper.cpp"
|
||||
echo "Sample usage:";
|
||||
echo "";
|
||||
echo " git clone https://github.com/ggml-org/whisper.cpp";
|
||||
echo " git clone https://github.com/ggerganov/whisper.cpp";
|
||||
echo " cd whisper.cpp";
|
||||
echo " make";
|
||||
echo " ./examples/yt-wsp.sh https://www.youtube.com/watch?v=1234567890";
|
||||
|
@ -100,10 +100,6 @@ else()
|
||||
set(INS_ENB ON)
|
||||
endif()
|
||||
|
||||
message(DEBUG "GGML_NATIVE : ${GGML_NATIVE}")
|
||||
message(DEBUG "GGML_NATIVE_DEFAULT : ${GGML_NATIVE_DEFAULT}")
|
||||
message(DEBUG "INS_ENB : ${INS_ENB}")
|
||||
|
||||
option(GGML_CPU_HBM "ggml: use memkind for CPU HBM" OFF)
|
||||
option(GGML_CPU_AARCH64 "ggml: use runtime weight conversion of Q4_0 to Q4_X_X" ON)
|
||||
option(GGML_CPU_KLEIDIAI "ggml: use KleidiAI optimized kernels if applicable" OFF)
|
||||
@ -127,12 +123,10 @@ endif()
|
||||
option(GGML_LASX "ggml: enable lasx" ON)
|
||||
option(GGML_LSX "ggml: enable lsx" ON)
|
||||
option(GGML_RVV "ggml: enable rvv" ON)
|
||||
option(GGML_RV_ZFH "ggml: enable riscv zfh" OFF)
|
||||
option(GGML_VXE "ggml: enable vxe" ON)
|
||||
|
||||
option(GGML_CPU_ALL_VARIANTS "ggml: build all variants of the CPU backend (requires GGML_BACKEND_DL)" OFF)
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
set(GGML_CPU_POWERPC_CPUTYPE "" CACHE STRING "ggml: CPU type for PowerPC")
|
||||
set(GGML_CPU_ARM_ARCH "" CACHE STRING "ggml: CPU architecture for ARM")
|
||||
|
||||
|
||||
if (WIN32)
|
||||
@ -192,7 +186,6 @@ option(GGML_OPENMP "ggml: use OpenMP"
|
||||
option(GGML_RPC "ggml: use RPC" OFF)
|
||||
option(GGML_SYCL "ggml: use SYCL" OFF)
|
||||
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
|
||||
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||
"ggml: sycl target device")
|
||||
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
||||
@ -202,8 +195,6 @@ option(GGML_OPENCL "ggml: use OpenCL"
|
||||
option(GGML_OPENCL_PROFILING "ggml: use OpenCL profiling (increases overhead)" OFF)
|
||||
option(GGML_OPENCL_EMBED_KERNELS "ggml: embed kernels" ON)
|
||||
option(GGML_OPENCL_USE_ADRENO_KERNELS "ggml: use optimized kernels for Adreno" ON)
|
||||
# OpenCL API version to target (cache string, default "300")
set   (GGML_OPENCL_TARGET_VERSION "300" CACHE STRING
                                        "ggml: OpenCL API version to target")
|
||||
|
||||
# toolchain for vulkan-shaders-gen
|
||||
set (GGML_VULKAN_SHADERS_GEN_TOOLCHAIN "" CACHE FILEPATH "ggml: toolchain file for vulkan-shaders-gen")
|
||||
|
@ -1,26 +0,0 @@
|
||||
# Select extra warning flags for a given compiler.
#
# Arguments:
#   CCID  - compiler ID to test (checked against "Clang", "AppleClang" and "GNU")
#   CCVER - compiler version, compared with VERSION_GREATER_EQUAL
#
# Output (set in the caller's scope):
#   GF_C_FLAGS   - warning flags for C sources
#   GF_CXX_FLAGS - warning flags for C++ sources
# Both results are empty when CCID matches none of the compilers above.
function(ggml_get_flags CCID CCVER)
    set(warn_c   "")
    set(warn_cxx "")

    if (CCID MATCHES "Clang")
        # flags shared by the C and C++ lists for every Clang flavour
        set(warn_unreachable -Wunreachable-code-break -Wunreachable-code-return)
        set(warn_c   ${warn_unreachable})
        set(warn_cxx ${warn_unreachable} -Wmissing-prototypes -Wextra-semi)

        # -Wdouble-promotion (C only) is gated on a per-flavour minimum version
        if (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0)
            list(APPEND warn_c -Wdouble-promotion)
        elseif (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
            list(APPEND warn_c -Wdouble-promotion)
        endif()
    elseif (CCID STREQUAL "GNU")
        set(warn_c   -Wdouble-promotion)
        set(warn_cxx -Wno-array-bounds)

        # -Wextra-semi is only requested for GCC >= 8.1.0
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND warn_cxx -Wextra-semi)
        endif()
    endif()

    set(GF_C_FLAGS   ${warn_c}   PARENT_SCOPE)
    set(GF_CXX_FLAGS ${warn_cxx} PARENT_SCOPE)
endfunction()
|
@ -5,7 +5,7 @@
|
||||
|
||||
set_and_check(GGML_INCLUDE_DIR "@PACKAGE_GGML_INCLUDE_INSTALL_DIR@")
|
||||
set_and_check(GGML_LIB_DIR "@PACKAGE_GGML_LIB_INSTALL_DIR@")
|
||||
#set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
set_and_check(GGML_BIN_DIR "@PACKAGE_GGML_BIN_INSTALL_DIR@")
|
||||
|
||||
find_package(Threads REQUIRED)
|
||||
|
||||
|
@ -17,9 +17,7 @@ GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_rpc_buffer_type(const c
|
||||
|
||||
GGML_BACKEND_API void ggml_backend_rpc_get_device_memory(const char * endpoint, size_t * free, size_t * total);
|
||||
|
||||
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint,
|
||||
const char * cache_dir,
|
||||
size_t free_mem, size_t total_mem);
|
||||
GGML_BACKEND_API void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem);
|
||||
|
||||
GGML_BACKEND_API ggml_backend_reg_t ggml_backend_rpc_reg(void);
|
||||
|
||||
|
@ -454,7 +454,6 @@ extern "C" {
|
||||
GGML_OP_RMS_NORM,
|
||||
GGML_OP_RMS_NORM_BACK,
|
||||
GGML_OP_GROUP_NORM,
|
||||
GGML_OP_L2_NORM,
|
||||
|
||||
GGML_OP_MUL_MAT,
|
||||
GGML_OP_MUL_MAT_ID,
|
||||
@ -503,7 +502,6 @@ extern "C" {
|
||||
GGML_OP_ADD_REL_POS,
|
||||
GGML_OP_RWKV_WKV6,
|
||||
GGML_OP_GATED_LINEAR_ATTN,
|
||||
GGML_OP_RWKV_WKV7,
|
||||
|
||||
GGML_OP_UNARY,
|
||||
|
||||
@ -1097,18 +1095,6 @@ extern "C" {
|
||||
int n_groups,
|
||||
float eps);
|
||||
|
||||
// l2 normalize along rows
|
||||
// used in rwkv v7
|
||||
GGML_API struct ggml_tensor * ggml_l2_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float eps);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_l2_norm_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float eps);
|
||||
|
||||
// a - x
|
||||
// b - dy
|
||||
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
||||
@ -1791,11 +1777,11 @@ extern "C" {
|
||||
|
||||
#define GGML_KQ_MASK_PAD 64
|
||||
|
||||
// q: [n_embd_k, n_batch, n_head, 1]
|
||||
// k: [n_embd_k, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd_v, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd_v, n_head, n_batch, 1] !! permuted !!
|
||||
// q: [n_embd, n_batch, n_head, 1]
|
||||
// k: [n_embd, n_kv, n_head_kv, 1]
|
||||
// v: [n_embd, n_kv, n_head_kv, 1] !! not transposed !!
|
||||
// mask: [n_kv, n_batch_pad, 1, 1] !! n_batch_pad = GGML_PAD(n_batch, GGML_KQ_MASK_PAD) !!
|
||||
// res: [n_embd, n_head, n_batch, 1] !! permuted !!
|
||||
GGML_API struct ggml_tensor * ggml_flash_attn_ext(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * q,
|
||||
@ -1904,16 +1890,6 @@ extern "C" {
|
||||
struct ggml_tensor * state,
|
||||
float scale);
|
||||
|
||||
GGML_API struct ggml_tensor * ggml_rwkv_wkv7(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * r,
|
||||
struct ggml_tensor * w,
|
||||
struct ggml_tensor * k,
|
||||
struct ggml_tensor * v,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
struct ggml_tensor * state);
|
||||
|
||||
// custom operators
|
||||
|
||||
typedef void (*ggml_unary_op_f32_t) (const int, float *, const float *);
|
||||
|
@ -1,5 +1,4 @@
|
||||
include(CheckCXXCompilerFlag)
|
||||
include("../cmake/common.cmake")
|
||||
|
||||
add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
|
||||
|
||||
@ -25,6 +24,33 @@ if (NOT MSVC)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Select extra warning flags for a given compiler.
#
# Arguments:
#   CCID  - compiler ID to test (checked against "Clang", "AppleClang" and "GNU")
#   CCVER - compiler version, compared with VERSION_GREATER_EQUAL
#
# Output (set in the caller's scope):
#   GF_C_FLAGS   - warning flags for C sources
#   GF_CXX_FLAGS - warning flags for C++ sources
# Both results are empty when CCID matches none of the compilers above.
function(ggml_get_flags CCID CCVER)
    set(warn_c   "")
    set(warn_cxx "")

    if (CCID MATCHES "Clang")
        # flags shared by the C and C++ lists for every Clang flavour
        set(warn_unreachable -Wunreachable-code-break -Wunreachable-code-return)
        set(warn_c   ${warn_unreachable})
        set(warn_cxx ${warn_unreachable} -Wmissing-prototypes -Wextra-semi)

        # -Wdouble-promotion (C only) is gated on a per-flavour minimum version
        if (CCID STREQUAL "Clang" AND CCVER VERSION_GREATER_EQUAL 3.8.0)
            list(APPEND warn_c -Wdouble-promotion)
        elseif (CCID STREQUAL "AppleClang" AND CCVER VERSION_GREATER_EQUAL 7.3.0)
            list(APPEND warn_c -Wdouble-promotion)
        endif()
    elseif (CCID STREQUAL "GNU")
        set(warn_c   -Wdouble-promotion)
        set(warn_cxx -Wno-array-bounds)

        # -Wextra-semi is only requested for GCC >= 8.1.0
        if (CCVER VERSION_GREATER_EQUAL 8.1.0)
            list(APPEND warn_cxx -Wextra-semi)
        endif()
    endif()

    set(GF_C_FLAGS   ${warn_c}   PARENT_SCOPE)
    set(GF_CXX_FLAGS ${warn_cxx} PARENT_SCOPE)
endfunction()
|
||||
|
||||
if (GGML_FATAL_WARNINGS)
|
||||
if (CMAKE_CXX_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
|
||||
list(APPEND C_FLAGS -Werror)
|
||||
@ -65,7 +91,7 @@ if (GGML_LTO)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAUNCHER)
|
||||
if (GGML_CCACHE)
|
||||
find_program(GGML_CCACHE_FOUND ccache)
|
||||
find_program(GGML_SCCACHE_FOUND sccache)
|
||||
|
||||
@ -76,11 +102,7 @@ if (GGML_CCACHE AND NOT CMAKE_C_COMPILER_LAUNCHER AND NOT CMAKE_CXX_COMPILER_LAU
|
||||
set(GGML_CCACHE_VARIANT sccache)
|
||||
endif()
|
||||
# TODO: should not be set globally
|
||||
if (GGML_SYCL AND GGML_CCACHE_FOUND AND WIN32)
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "ccache compiler_type=icl")
|
||||
else ()
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
|
||||
endif ()
|
||||
set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${GGML_CCACHE_VARIANT}")
|
||||
set(ENV{CCACHE_SLOPPINESS} time_macros)
|
||||
message(STATUS "${GGML_CCACHE_VARIANT} found, compilation results will be cached. Disable with GGML_CCACHE=OFF.")
|
||||
else()
|
||||
@ -329,10 +351,6 @@ if (CMAKE_SYSTEM_NAME MATCHES "Android")
|
||||
target_link_libraries(ggml-base PRIVATE dl)
|
||||
endif()
|
||||
|
||||
if(CMAKE_SYSTEM_NAME MATCHES "visionOS")
|
||||
target_compile_definitions(ggml-base PUBLIC _DARWIN_C_SOURCE)
|
||||
endif()
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
foreach (target ggml-base ggml)
|
||||
set_target_properties(${target} PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
@ -76,14 +76,7 @@ namespace fs = std::filesystem;
|
||||
static std::string path_str(const fs::path & path) {
|
||||
std::string u8path;
|
||||
try {
|
||||
#if defined(__cpp_lib_char8_t)
|
||||
// C++20 and later: u8string() returns std::u8string
|
||||
std::u8string u8str = path.u8string();
|
||||
u8path = std::string(reinterpret_cast<const char*>(u8str.c_str()));
|
||||
#else
|
||||
// C++17: u8string() returns std::string
|
||||
u8path = path.u8string();
|
||||
#endif
|
||||
} catch (...) {
|
||||
}
|
||||
return u8path;
|
||||
@ -497,7 +490,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
search_paths.push_back(get_executable_path());
|
||||
search_paths.push_back(fs::current_path());
|
||||
} else {
|
||||
search_paths.push_back(fs::u8path(user_search_path));
|
||||
search_paths.push_back(user_search_path);
|
||||
}
|
||||
|
||||
int best_score = 0;
|
||||
@ -511,9 +504,9 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
fs::directory_iterator dir_it(search_path, fs::directory_options::skip_permission_denied);
|
||||
for (const auto & entry : dir_it) {
|
||||
if (entry.is_regular_file()) {
|
||||
auto filename = entry.path().filename();
|
||||
auto ext = entry.path().extension();
|
||||
if (filename.native().find(file_prefix) == 0 && ext == file_extension) {
|
||||
auto filename = entry.path().filename().native();
|
||||
auto ext = entry.path().extension().native();
|
||||
if (filename.find(file_prefix) == 0 && ext == file_extension) {
|
||||
dl_handle_ptr handle { dl_load_library(entry) };
|
||||
if (!handle && !silent) {
|
||||
GGML_LOG_ERROR("%s: failed to load %s\n", __func__, path_str(entry.path()).c_str());
|
||||
@ -544,7 +537,7 @@ static ggml_backend_reg_t ggml_backend_load_best(const char * name, bool silent,
|
||||
// try to load the base backend
|
||||
for (const auto & search_path : search_paths) {
|
||||
fs::path filename = backend_filename_prefix().native() + name_path.native() + backend_filename_extension().native();
|
||||
fs::path path = search_path / filename;
|
||||
fs::path path = search_path.native() + filename.native();
|
||||
if (fs::exists(path)) {
|
||||
return get_reg().load_backend(path, silent);
|
||||
}
|
||||
|
@ -51,11 +51,13 @@ if (CANN_INSTALL_DIR)
|
||||
${CANN_INSTALL_DIR}/acllib/include
|
||||
)
|
||||
|
||||
add_subdirectory(kernels)
|
||||
list(APPEND CANN_LIBRARIES
|
||||
ascendcl
|
||||
nnopbase
|
||||
opapi
|
||||
acl_op_compiler
|
||||
ascendc_kernels
|
||||
)
|
||||
|
||||
file(GLOB GGML_SOURCES_CANN "*.cpp")
|
||||
|
@ -30,7 +30,6 @@
|
||||
#include <aclnnop/aclnn_copy.h>
|
||||
#include <aclnnop/aclnn_cos.h>
|
||||
#include <aclnnop/aclnn_div.h>
|
||||
#include <aclnnop/aclnn_embedding.h>
|
||||
#include <aclnnop/aclnn_exp.h>
|
||||
#include <aclnnop/aclnn_fill_scalar.h>
|
||||
#include <aclnnop/aclnn_group_norm.h>
|
||||
@ -59,6 +58,7 @@
|
||||
#include <vector>
|
||||
|
||||
#include "ggml-impl.h"
|
||||
#include "kernels/ascendc_kernels.h"
|
||||
|
||||
#define GGML_COMMON_DECL_C
|
||||
|
||||
@ -99,35 +99,6 @@ static void aclnn_repeat(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
ACL_CHECK(aclDestroyIntArray(repeats));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Casts the elements of a tensor to a specified data type using the CANN backend.
|
||||
*
|
||||
* @details This function performs a type conversion on the elements of the input tensor `acl_src`
|
||||
* and stores the results in the destination tensor `acl_dst`. The conversion type is
|
||||
* determined based on the `dst` tensor's data type.
|
||||
*
|
||||
* @param ctx The context for the CANN backend operations.
|
||||
* @param acl_src The source tensor whose elements will be cast.
|
||||
* @param acl_dst The destination tensor that will store the casted elements.
|
||||
* @param dst The ggml tensor specifying the target data type.
|
||||
*/
|
||||
static void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
aclTensor* acl_dst, ggml_tensor* dst) {
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src,
|
||||
ggml_cann_type_mapping(dst->type),
|
||||
acl_dst, &workspaceSize, &executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(), workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
}
|
||||
|
||||
void ggml_cann_repeat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src = dst->src[0];
|
||||
GGML_ASSERT(ggml_can_repeat(src, dst));
|
||||
@ -918,76 +889,173 @@ static void cann_copy(ggml_backend_cann_context& ctx, aclTensor* acl_src,
|
||||
}
|
||||
|
||||
void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src = dst->src[0];
|
||||
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src0);
|
||||
aclTensor* acl_src = ggml_cann_create_tensor(src);
|
||||
aclTensor* acl_dst = ggml_cann_create_tensor(dst);
|
||||
if (ggml_are_same_shape(src0, dst)) {
|
||||
if (dst->type == src0->type) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
} else {
|
||||
aclnn_cast(ctx, acl_src, acl_dst, dst);
|
||||
}
|
||||
} else {
|
||||
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst)) {
|
||||
if (dst->type == src0->type) {
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
dst->data, cpy_size, src0->data, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
||||
return;
|
||||
} else {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(),
|
||||
ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(
|
||||
dst->data, cpy_size, src_trans_buffer, cpy_size,
|
||||
ACL_MEMCPY_DEVICE_TO_DEVICE, ctx.stream()));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
return;
|
||||
}
|
||||
} else if (ggml_is_contiguous(dst)) {
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(dst) * ggml_type_size(dst->type));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = ggml_type_size(dst->type);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ggml_cann_type_mapping(dst->type),
|
||||
ggml_type_size(dst->type), src0->ne, src_trans_nb,
|
||||
GGML_MAX_DIMS);
|
||||
ggml_cann_pool_alloc src_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
src->extra = src_extra_allocator.get();
|
||||
dst->extra = dst_extra_allocator.get();
|
||||
ACL_CHECK(aclrtMemcpyAsync(src->extra, sizeof(ggml_tensor), src,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
|
||||
aclnn_cast(ctx, acl_src, src_trans_tensor, dst);
|
||||
|
||||
size_t cpy_size = ggml_nbytes(dst);
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->data, cpy_size, src_trans_buffer,
|
||||
cpy_size, ACL_MEMCPY_DEVICE_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
return;
|
||||
} else {
|
||||
GGML_ABORT("Unsupport dst is not tontiguous.");
|
||||
}
|
||||
if ((dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32) &&
|
||||
ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
// TODO: simplify
|
||||
if (src->type == GGML_TYPE_F16) {
|
||||
if (dst->type == GGML_TYPE_Q8_0) {
|
||||
aclrtlaunch_ascendc_quantize_f16_q8_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_Q4_0) {
|
||||
aclrtlaunch_ascendc_quantize_f16_to_q4_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src0 is contiguous on first dimension, copy by rows
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp16(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src0 is contigous on first dimension, copy by rows
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
// TODO
|
||||
GGML_ABORT("fatal error");
|
||||
} else if (src->type == GGML_TYPE_F32) {
|
||||
// TODO: if (src0->type == dst->type && ne00 == ne0 && nb00 == type_size
|
||||
// && nb0 == type_size)
|
||||
if (dst->type == GGML_TYPE_Q8_0) {
|
||||
aclrtlaunch_ascendc_quantize_f32_q8_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_Q4_0) {
|
||||
aclrtlaunch_ascendc_quantize_f32_to_q4_0(
|
||||
24, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne);
|
||||
return;
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F32) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src0 is contigous on first dimension, copy by rows
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp32(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
// TODO: dst not contiguous
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
if (dst->type == GGML_TYPE_F16) {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
if (ggml_is_contiguous(dst)) {
|
||||
const size_t src_type_size = ggml_type_size(src->type);
|
||||
if (src->nb[0] == src_type_size) {
|
||||
// src0 is contigous on first dimension, copy by rows
|
||||
int64_t rows_num = ggml_nrows(src);
|
||||
aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
|
||||
rows_num, ctx.stream(), src->data, dst->data,
|
||||
((ggml_tensor*)src->extra)->ne,
|
||||
((ggml_tensor*)src->extra)->nb,
|
||||
((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
// TODO
|
||||
GGML_ABORT("fatal error");
|
||||
} else {
|
||||
if (ggml_are_same_shape(src, dst)) {
|
||||
cann_copy(ctx, acl_src, acl_dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src));
|
||||
ACL_CHECK(aclDestroyTensor(acl_dst));
|
||||
return;
|
||||
}
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
@ -2310,168 +2378,85 @@ void ggml_cann_softmax(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ACL_CHECK(aclDestroyTensor(tmp_mask_tensor));
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Performs embedding operation on a 4D tensor using the CANN backend.
|
||||
*
|
||||
* This function extracts slices from the source tensor (`src_buffer`),
|
||||
* index tensor (`index`), and destination tensor (`dst`), and performs an
|
||||
* embedding operation on them. The embedding operation is applied by iterating
|
||||
* over the last two dimensions of the source tensor, creating the necessary
|
||||
* tensors for the source, index, and output, and executing the embedding operation.
|
||||
*
|
||||
* @param ctx The context for CANN backend operations.
|
||||
* @param src_buffer The source buffer holding the data for the source tensor.
|
||||
* @param src_ne The dimensions of the source tensor.
|
||||
* @param src_nb The strides (byte offsets) of the source tensor.
|
||||
* @param index The index tensor used in the embedding operation.
|
||||
* @param dst The destination tensor where the result will be stored.
|
||||
*/
|
||||
static void aclnn_embedding_4d(ggml_backend_cann_context& ctx, void* src_buffer,
|
||||
int64_t* src_ne, size_t* src_nb, ggml_tensor* index,
|
||||
ggml_tensor* dst) {
|
||||
for (int64_t i = 0; i < src_ne[3]; i++) {
|
||||
for (int64_t j = 0; j < src_ne[2]; j++) {
|
||||
// src
|
||||
int64_t acl_src_ne[2] = {src_ne[0], src_ne[1]};
|
||||
size_t acl_src_nb[2] = {src_nb[0], src_nb[1]};
|
||||
aclTensor* acl_src_tensor = ggml_cann_create_tensor(
|
||||
(char*)src_buffer + i * src_nb[3] + j * src_nb[2],
|
||||
ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
|
||||
acl_src_ne, acl_src_nb, 2);
|
||||
|
||||
// index
|
||||
int64_t acl_index_ne[1] = {index->ne[0]};
|
||||
size_t acl_index_nb[1] = {index->nb[0]};
|
||||
aclTensor* acl_index = ggml_cann_create_tensor(
|
||||
(char*)index->data + i * index->nb[2] + j * index->nb[1],
|
||||
ggml_cann_type_mapping(index->type), ggml_element_size(index),
|
||||
acl_index_ne, acl_index_nb, 1);
|
||||
|
||||
// out
|
||||
int64_t acl_out_ne[2] = {dst->ne[0], dst->ne[1]};
|
||||
size_t acl_out_nb[2] = {dst->nb[0], dst->nb[1]};
|
||||
aclTensor* acl_out = ggml_cann_create_tensor(
|
||||
(char*)dst->data + i * dst->nb[3] + j * dst->nb[2],
|
||||
ggml_cann_type_mapping(dst->type), ggml_element_size(dst),
|
||||
acl_out_ne, acl_out_nb, 2);
|
||||
|
||||
uint64_t workspaceSize = 0;
|
||||
aclOpExecutor* executor;
|
||||
void* workspaceAddr = nullptr;
|
||||
|
||||
ACL_CHECK(aclnnEmbeddingGetWorkspaceSize(
|
||||
acl_src_tensor, acl_index, acl_out, &workspaceSize, &executor));
|
||||
|
||||
if (workspaceSize > 0) {
|
||||
ggml_cann_pool_alloc workspace_allocator(ctx.pool(),
|
||||
workspaceSize);
|
||||
workspaceAddr = workspace_allocator.get();
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnEmbedding(workspaceAddr, workspaceSize, executor,
|
||||
ctx.stream()));
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(acl_src_tensor));
|
||||
ACL_CHECK(aclDestroyTensor(acl_index));
|
||||
ACL_CHECK(aclDestroyTensor(acl_out));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
|
||||
ggml_tensor* src0 = dst->src[0]; // src
|
||||
ggml_tensor* src1 = dst->src[1]; // index
|
||||
ggml_tensor* src0 = dst->src[0];
|
||||
ggml_tensor* src1 = dst->src[1];
|
||||
|
||||
ggml_cann_pool_alloc src0_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc src1_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
ggml_cann_pool_alloc dst_extra_allocator(ctx.pool(), sizeof(ggml_tensor));
|
||||
src0->extra = src0_extra_allocator.get();
|
||||
src1->extra = src1_extra_allocator.get();
|
||||
dst->extra = dst_extra_allocator.get();
|
||||
ACL_CHECK(aclrtMemcpyAsync(src0->extra, sizeof(ggml_tensor), src0,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(src1->extra, sizeof(ggml_tensor), src1,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
ACL_CHECK(aclrtMemcpyAsync(dst->extra, sizeof(ggml_tensor), dst,
|
||||
sizeof(ggml_tensor), ACL_MEMCPY_HOST_TO_DEVICE,
|
||||
ctx.stream()));
|
||||
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F32: {
|
||||
aclnn_embedding_4d(ctx, src0->data, src0->ne, src0->nb, src1,
|
||||
dst);
|
||||
#ifdef ASCEND_310P
|
||||
// Special operation for get_row_f32 kernel of 310P: clear the
|
||||
// content of dest data buffer when row is not aligned to 32 bytes
|
||||
if ((src0->ne[0] % 8) != 0) {
|
||||
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] *
|
||||
src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
|
||||
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
|
||||
}
|
||||
#endif
|
||||
aclrtlaunch_ascendc_get_row_f32(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src0->extra)->nb,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_F16: {
|
||||
aclTensor* acl_src0 = ggml_cann_create_tensor(src0);
|
||||
ggml_cann_pool_alloc src_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
|
||||
void* src_trans_buffer = src_buffer_allocator.get();
|
||||
size_t src_trans_nb[GGML_MAX_DIMS];
|
||||
src_trans_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
src_trans_nb[i] = src_trans_nb[i - 1] * src0->ne[i - 1];
|
||||
#ifdef ASCEND_310P
|
||||
// Special operation for get_row_f16 kernel of 310P: clear the
|
||||
// content of dest data buffer when row is not aligned to 32 bytes
|
||||
if ((src0->ne[0] % 16) != 0) {
|
||||
size_t dst_len =
|
||||
src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] *
|
||||
ggml_type_size(
|
||||
GGML_TYPE_F32); // out is also f32, even input is f16
|
||||
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
|
||||
}
|
||||
aclTensor* src_trans_tensor = ggml_cann_create_tensor(
|
||||
src_trans_buffer, ACL_FLOAT, ggml_type_size(dst->type),
|
||||
src0->ne, src_trans_nb, GGML_MAX_DIMS);
|
||||
aclnn_cast(ctx, acl_src0, src_trans_tensor, dst);
|
||||
aclnn_embedding_4d(ctx, src_trans_buffer, src0->ne,
|
||||
src_trans_nb, src1, dst);
|
||||
ACL_CHECK(aclDestroyTensor(acl_src0));
|
||||
ACL_CHECK(aclDestroyTensor(src_trans_tensor));
|
||||
#endif
|
||||
aclrtlaunch_ascendc_get_row_f16(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src0->extra)->nb,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
}
|
||||
case GGML_TYPE_Q8_0: {
|
||||
// add 1 dim for bcast mul.
|
||||
size_t weight_nb[GGML_MAX_DIMS + 1], scale_nb[GGML_MAX_DIMS + 1],
|
||||
dequant_nb[GGML_MAX_DIMS + 1];
|
||||
int64_t weight_ne[GGML_MAX_DIMS + 1], scale_ne[GGML_MAX_DIMS + 1],
|
||||
*dequant_ne;
|
||||
int64_t scale_offset = 0;
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
weight_ne[0] = QK8_0;
|
||||
weight_ne[1] = src0->ne[0] / QK8_0;
|
||||
weight_nb[0] = sizeof(int8_t);
|
||||
weight_nb[1] = weight_nb[0] * weight_ne[0];
|
||||
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
||||
weight_ne[i] = src0->ne[i - 1];
|
||||
weight_nb[i] = weight_nb[i - 1] * weight_ne[i - 1];
|
||||
}
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,1]
|
||||
scale_ne[0] = 1;
|
||||
scale_ne[1] = src0->ne[0] / QK8_0;
|
||||
scale_nb[0] = sizeof(uint16_t);
|
||||
scale_nb[1] = scale_nb[0] * scale_ne[0];
|
||||
for (int i = 2; i < GGML_MAX_DIMS + 1; i++) {
|
||||
scale_ne[i] = src0->ne[i - 1];
|
||||
scale_nb[i] = scale_nb[i - 1] * scale_ne[i - 1];
|
||||
}
|
||||
|
||||
// [3,4,5,64] -> [3,4,5,2,32]
|
||||
dequant_ne = weight_ne;
|
||||
dequant_nb[0] = sizeof(float_t);
|
||||
for (int i = 1; i < GGML_MAX_DIMS + 1; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * dequant_ne[i - 1];
|
||||
}
|
||||
|
||||
scale_offset = ggml_nelements(src0) * sizeof(int8_t);
|
||||
ggml_cann_pool_alloc dequant_buffer_allocator(
|
||||
ctx.pool(), ggml_nelements(src0) * sizeof(float_t));
|
||||
|
||||
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ACL_INT8, sizeof(int8_t), weight_ne, weight_nb,
|
||||
GGML_MAX_DIMS + 1);
|
||||
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
|
||||
src0->data, ACL_FLOAT16, sizeof(float16_t), scale_ne, scale_nb,
|
||||
GGML_MAX_DIMS + 1, ACL_FORMAT_ND, scale_offset);
|
||||
aclTensor* dequant_tensor = ggml_cann_create_tensor(
|
||||
dequant_buffer_allocator.get(), ACL_FLOAT, sizeof(float_t),
|
||||
dequant_ne, dequant_nb, GGML_MAX_DIMS + 1);
|
||||
|
||||
aclnn_mul(ctx, acl_weight_tensor, acl_scale_tensor, dequant_tensor);
|
||||
dequant_nb[0] = sizeof(float_t);
|
||||
dequant_ne = src0->ne;
|
||||
for (int i = 1; i < GGML_MAX_DIMS; i++) {
|
||||
dequant_nb[i] = dequant_nb[i - 1] * src0->ne[i - 1];
|
||||
}
|
||||
|
||||
aclnn_embedding_4d(ctx, dequant_buffer_allocator.get(),
|
||||
dequant_ne, dequant_nb, src1, dst);
|
||||
|
||||
ACL_CHECK(aclDestroyTensor(dequant_tensor));
|
||||
case GGML_TYPE_Q4_0:
|
||||
aclrtlaunch_ascendc_get_row_q4_0(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
case GGML_TYPE_Q8_0:
|
||||
aclrtlaunch_ascendc_get_row_q8_0(
|
||||
24, ctx.stream(), src0->data, src1->data, dst->data,
|
||||
((ggml_tensor*)src0->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->ne,
|
||||
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
|
||||
((ggml_tensor*)dst->extra)->nb);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
GGML_ABORT("Unsupported tensor type for GGML_OP_GET_ROWS");
|
||||
GGML_ABORT("fatal error");
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -2805,15 +2790,11 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
(char*)output_buffer + batch1 * output_stride, ACL_FLOAT16,
|
||||
output_elem_size, output_ne, output_nb, 2, ACL_FORMAT_ND,
|
||||
output_ne_offset);
|
||||
int64_t antiquantGroupSize = 0;
|
||||
if (src0->ne[0] > QK8_0) {
|
||||
antiquantGroupSize = QK8_0;
|
||||
}
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor, nullptr,
|
||||
nullptr, nullptr, nullptr, antiquantGroupSize,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
nullptr, nullptr, nullptr, QK8_0, acl_output_tensor,
|
||||
&workspaceSize, &executor));
|
||||
if (workspaceAddr == nullptr) {
|
||||
workspaceAddr = workspace_allocator.alloc(workspaceSize);
|
||||
}
|
||||
@ -2852,7 +2833,7 @@ static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
|
||||
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2GetWorkspaceSize(
|
||||
acl_input_tensor, acl_weight_tensor, acl_scale_tensor,
|
||||
nullptr, nullptr, nullptr, nullptr, antiquantGroupSize,
|
||||
nullptr, nullptr, nullptr, nullptr, QK8_0,
|
||||
acl_output_tensor, &workspaceSize, &executor));
|
||||
ACL_CHECK(aclnnWeightQuantBatchMatmulV2(
|
||||
workspaceAddr, workspaceSize, executor, ctx.stream()));
|
||||
|
@ -1689,6 +1689,11 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
case GGML_OP_MUL_MAT: {
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_Q8_0:
|
||||
// Current groupsize should not be greater than k-1 in
|
||||
// aclnnWeightQuantBatchMatmulV2GetWorkspaceSize
|
||||
if (op->src[0]->ne[0] <= QK8_0) {
|
||||
return false;
|
||||
}
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_Q4_0:
|
||||
@ -1704,6 +1709,7 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
switch (op->src[0]->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q4_0:
|
||||
case GGML_TYPE_Q8_0:
|
||||
return true;
|
||||
default:
|
||||
@ -1711,21 +1717,16 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
} break;
|
||||
case GGML_OP_CPY: {
|
||||
ggml_tensor *src = op->src[0];
|
||||
if ((op->type != GGML_TYPE_F32 && op->type != GGML_TYPE_F16) ||
|
||||
(src->type != GGML_TYPE_F32 &&
|
||||
src->type != GGML_TYPE_F16)) {
|
||||
// only support F32 and F16.
|
||||
return false;
|
||||
switch (op->type) {
|
||||
case GGML_TYPE_F32:
|
||||
case GGML_TYPE_F16:
|
||||
case GGML_TYPE_Q8_0:
|
||||
case GGML_TYPE_Q4_0:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
|
||||
// unsupport dst is not contiguous.
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} break;
|
||||
}
|
||||
case GGML_OP_CONT: {
|
||||
// TODO: support GGML_TYPE_BF16
|
||||
switch (op->src[0]->type) {
|
||||
@ -1766,9 +1767,9 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
}
|
||||
return true;
|
||||
}
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_IM2COL:
|
||||
case GGML_OP_CONCAT:
|
||||
case GGML_OP_DUP:
|
||||
case GGML_OP_REPEAT:
|
||||
case GGML_OP_NONE:
|
||||
case GGML_OP_RESHAPE:
|
||||
|
@ -158,12 +158,6 @@ typedef sycl::half2 ggml_half2;
|
||||
|
||||
#endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define GGML_EXTENSION
|
||||
#else // _MSC_VER
|
||||
#define GGML_EXTENSION __extension__
|
||||
#endif // _MSC_VER
|
||||
|
||||
#define QK4_0 32
|
||||
typedef struct {
|
||||
ggml_half d; // delta
|
||||
@ -173,7 +167,7 @@ static_assert(sizeof(block_q4_0) == sizeof(ggml_half) + QK4_0 / 2, "wrong q4_0 b
|
||||
|
||||
#define QK4_1 32
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half m; // min
|
||||
@ -194,7 +188,7 @@ static_assert(sizeof(block_q5_0) == sizeof(ggml_half) + sizeof(uint32_t) + QK5_0
|
||||
|
||||
#define QK5_1 32
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half m; // min
|
||||
@ -215,7 +209,7 @@ static_assert(sizeof(block_q8_0) == sizeof(ggml_half) + QK8_0, "wrong q8_0 block
|
||||
|
||||
#define QK8_1 32
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // delta
|
||||
ggml_half s; // d * sum(qs[i])
|
||||
@ -256,7 +250,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(ggml_half) + QK_K / 4, "wrong tq2_0
|
||||
typedef struct {
|
||||
uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
|
||||
uint8_t qs[QK_K/4]; // quants
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
@ -283,7 +277,7 @@ static_assert(sizeof(block_q3_K) == sizeof(ggml_half) + QK_K / 4 + QK_K / 8 + 12
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 4.5 bits per weight
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
@ -300,7 +294,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(ggml_half) + K_SCALE_SIZE + QK_K/2,
|
||||
// weight is represented as x = a * q + b
|
||||
// Effectively 5.5 bits per weight
|
||||
typedef struct {
|
||||
GGML_EXTENSION union {
|
||||
union {
|
||||
struct {
|
||||
ggml_half d; // super-block scale for quantized scales
|
||||
ggml_half dmin; // super-block scale for quantized mins
|
||||
|
@ -23,16 +23,6 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
ggml-cpu/amx/mmq.cpp
|
||||
ggml-cpu/amx/mmq.h
|
||||
ggml-cpu/ggml-cpu-impl.h
|
||||
ggml-cpu/common.h
|
||||
ggml-cpu/binary-ops.h
|
||||
ggml-cpu/binary-ops.cpp
|
||||
ggml-cpu/unary-ops.h
|
||||
ggml-cpu/unary-ops.cpp
|
||||
ggml-cpu/simd-mappings.h
|
||||
ggml-cpu/vec.h
|
||||
ggml-cpu/vec.cpp
|
||||
ggml-cpu/ops.h
|
||||
ggml-cpu/ops.cpp
|
||||
)
|
||||
|
||||
target_compile_features(${GGML_CPU_NAME} PRIVATE c_std_11 cxx_std_17)
|
||||
@ -297,31 +287,17 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
elseif ("${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "ppc64le " OR "${CMAKE_SYSTEM_PROCESSOR} " STREQUAL "powerpc ")
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
message(STATUS "PowerPC detected")
|
||||
if (GGML_NATIVE)
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64")
|
||||
file(READ "/proc/cpuinfo" POWER10_M)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc")
|
||||
execute_process(COMMAND bash -c "prtconf |grep 'Implementation' | head -n 1" OUTPUT_VARIABLE POWER10_M)
|
||||
endif()
|
||||
|
||||
string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M}")
|
||||
string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
|
||||
|
||||
if (EXTRACTED_NUMBER GREATER_EQUAL 10)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10 -mpowerpc64)
|
||||
elseif (EXTRACTED_NUMBER EQUAL 9)
|
||||
list(APPEND ARCH_FLAGS -mcpu=power9 -mpowerpc64)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -mcpu=native -mtune=native -mpowerpc64)
|
||||
endif()
|
||||
execute_process(COMMAND bash -c "grep POWER /proc/cpuinfo | head -n 1" OUTPUT_VARIABLE POWER_M)
|
||||
if (${POWER_M} MATCHES "POWER10")
|
||||
list(APPEND ARCH_FLAGS -mcpu=power10)
|
||||
elseif (${POWER_M} MATCHES "POWER9")
|
||||
list(APPEND ARCH_FLAGS -mcpu=power9)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64le -mtune=native)
|
||||
else()
|
||||
if (GGML_CPU_POWERPC_CPUTYPE)
|
||||
list(APPEND ARCH_FLAGS -mcpu=${GGML_CPU_POWERPC_CPUTYPE})
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS -mcpu=powerpc64 -mtune=native)
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "loongarch64")
|
||||
message(STATUS "loongarch64 detected")
|
||||
@ -336,11 +312,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64")
|
||||
message(STATUS "RISC-V detected")
|
||||
if (GGML_RVV)
|
||||
if (GGML_RV_ZFH)
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv_zfhmin -DGGML_RV_ZFH -mabi=lp64d)
|
||||
else()
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
||||
endif()
|
||||
list(APPEND ARCH_FLAGS -march=rv64gcv -mabi=lp64d)
|
||||
endif()
|
||||
elseif (${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x")
|
||||
message(STATUS "s390x detected")
|
||||
@ -379,9 +351,9 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
|
||||
|
||||
# Fetch KleidiAI sources:
|
||||
include(FetchContent)
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.5.0")
|
||||
set(KLEIDIAI_COMMIT_TAG "v1.3.0")
|
||||
set(KLEIDIAI_DOWNLOAD_URL "https://github.com/ARM-software/kleidiai/archive/refs/tags/${KLEIDIAI_COMMIT_TAG}.tar.gz")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "ea22e1aefb800e9bc8c74d91633cc58e")
|
||||
set(KLEIDIAI_ARCHIVE_MD5 "060bd2dc64642b091f461cc8dd7426d9")
|
||||
|
||||
if (POLICY CMP0135)
|
||||
cmake_policy(SET CMP0135 NEW)
|
||||
|
@ -1,158 +0,0 @@
|
||||
#include "binary-ops.h"
|
||||
|
||||
#if defined(GGML_USE_ACCELERATE)
|
||||
#include <Accelerate/Accelerate.h>
|
||||
|
||||
using vDSP_fn_t = void (*)(const float *, vDSP_Stride, const float *, vDSP_Stride, float *, vDSP_Stride, vDSP_Length);
|
||||
#endif
|
||||
|
||||
static inline float op_add(float a, float b) {
|
||||
return a + b;
|
||||
}
|
||||
|
||||
static inline float op_sub(float a, float b) {
|
||||
return a - b;
|
||||
}
|
||||
|
||||
static inline float op_mul(float a, float b) {
|
||||
return a * b;
|
||||
}
|
||||
|
||||
static inline float op_div(float a, float b) {
|
||||
return a / b;
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static inline void vec_binary_op_contiguous(const int64_t n, dst_t * z, const src0_t * x, const src1_t * y) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(y[i])));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static inline void vec_binary_op_non_contiguous(const int64_t n, const int64_t ne10, const int64_t nb10, dst_t * z, const src0_t * x, const src1_t * y) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto src1_to_f32 = type_conversion_table<src1_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
int i10 = i % ne10;
|
||||
const src1_t * y_ptr = (const src1_t *)((const char *)y + i10*nb10);
|
||||
z[i] = f32_to_dst(op(src0_to_f32(x[i]), src1_to_f32(*y_ptr)));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float, float), typename src0_t, typename src1_t, typename dst_t>
|
||||
static void apply_binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
GGML_ASSERT(ggml_can_repeat(src1, src0) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT( nb0 == sizeof(dst_t));
|
||||
GGML_ASSERT(nb00 == sizeof(src0_t));
|
||||
|
||||
const auto [ir0, ir1] = get_thread_range(params, src0);
|
||||
const bool is_src1_contiguous = (nb10 == sizeof(src1_t));
|
||||
|
||||
if (!is_src1_contiguous) { // broadcast not implemented yet for non-contiguous
|
||||
GGML_ASSERT(ggml_are_same_shape(src0, src1));
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
vDSP_fn_t vDSP_op = nullptr;
|
||||
// TODO - avoid the f32-only check using type 'trait' lookup tables and row-based src-to-float conversion functions
|
||||
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
if (op == op_add) {
|
||||
vDSP_op = vDSP_vadd;
|
||||
} else if (op == op_sub) {
|
||||
vDSP_op = vDSP_vsub;
|
||||
} else if (op == op_mul) {
|
||||
vDSP_op = vDSP_vmul;
|
||||
} else if (op == op_div) {
|
||||
vDSP_op = vDSP_vdiv;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
const int64_t i13 = i03 % ne13;
|
||||
const int64_t i12 = i02 % ne12;
|
||||
const int64_t i11 = i01 % ne11;
|
||||
|
||||
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
const src1_t * src1_ptr = (const src1_t *) ((const char *) src1->data + i13*nb13 + i12*nb12 + i11*nb11);
|
||||
|
||||
if (is_src1_contiguous) {
|
||||
// src1 is broadcastable across src0 and dst in i1, i2, i3
|
||||
const int64_t nr0 = ne00 / ne10;
|
||||
|
||||
for (int64_t r = 0; r < nr0; ++r) {
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
if constexpr (std::is_same_v<src0_t, float> && std::is_same_v<src1_t, float> && std::is_same_v<dst_t, float>) {
|
||||
if (vDSP_op != nullptr) {
|
||||
vDSP_op(src1_ptr, 1, src0_ptr + r*ne10, 1, dst_ptr + r*ne10, 1, ne10);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
vec_binary_op_contiguous<op>(ne10, dst_ptr + r*ne10, src0_ptr + r*ne10, src1_ptr);
|
||||
}
|
||||
} else {
|
||||
vec_binary_op_non_contiguous<op>(ne0, ne10, nb10, dst_ptr, src0_ptr, src1_ptr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
|
||||
template <float (*op)(float, float)>
|
||||
static void binary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const ggml_tensor * src1 = dst->src[1];
|
||||
|
||||
/* */ if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
|
||||
apply_binary_op<op, float, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
|
||||
apply_binary_op<op, ggml_fp16_t, ggml_fp16_t, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
|
||||
apply_binary_op<op, ggml_bf16_t, ggml_bf16_t, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_BF16) {
|
||||
apply_binary_op<op, ggml_bf16_t, float, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
apply_binary_op<op, ggml_bf16_t, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F16) {
|
||||
apply_binary_op<op, ggml_fp16_t, float, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
|
||||
apply_binary_op<op, ggml_fp16_t, float, float>(params, dst);
|
||||
} else {
|
||||
GGML_ABORT("%s: unsupported types: dst: %s, src0: %s, src1: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type), ggml_type_name(src1->type));
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_compute_forward_add_non_quantized(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_add>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_sub(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_sub>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_mul(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_mul>(params, dst);
|
||||
}
|
||||
|
||||
void ggml_compute_forward_div(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
binary_op<op_div>(params, dst);
|
||||
}
|
@ -1,16 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_compute_forward_add_non_quantized(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sub(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mul(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_div(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -1,72 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
#include "ggml-cpu-traits.h"
|
||||
#include "ggml-cpu-impl.h"
|
||||
#include "ggml-impl.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
#include <utility>
|
||||
|
||||
// convenience functions/macros for use in template calls
|
||||
// note: these won't be required after the 'traits' lookup table is used.
|
||||
static inline ggml_fp16_t f32_to_f16(float x) {
|
||||
return GGML_FP32_TO_FP16(x);
|
||||
}
|
||||
|
||||
static inline float f16_to_f32(ggml_fp16_t x) {
|
||||
return GGML_FP16_TO_FP32(x);
|
||||
}
|
||||
|
||||
static inline ggml_bf16_t f32_to_bf16(float x) {
|
||||
return GGML_FP32_TO_BF16(x);
|
||||
}
|
||||
|
||||
static inline float bf16_to_f32(ggml_bf16_t x) {
|
||||
return GGML_BF16_TO_FP32(x);
|
||||
}
|
||||
|
||||
static inline float f32_to_f32(float x) {
|
||||
return x;
|
||||
}
|
||||
|
||||
// TODO - merge this into the traits table, after using row-based conversions
|
||||
template <class T>
|
||||
struct type_conversion_table;
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<ggml_fp16_t> {
|
||||
static constexpr float (*to_f32)(ggml_fp16_t) = f16_to_f32;
|
||||
static constexpr ggml_fp16_t (*from_f32)(float) = f32_to_f16;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<float> {
|
||||
static constexpr float (*to_f32)(float) = f32_to_f32;
|
||||
static constexpr float (*from_f32)(float) = f32_to_f32;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct type_conversion_table<ggml_bf16_t> {
|
||||
static constexpr float (*to_f32)(ggml_bf16_t) = bf16_to_f32;
|
||||
static constexpr ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
|
||||
};
|
||||
|
||||
static std::pair<int64_t, int64_t> get_thread_range(const struct ggml_compute_params * params, const struct ggml_tensor * src0) {
|
||||
const int64_t ith = params->ith;
|
||||
const int64_t nth = params->nth;
|
||||
|
||||
const int64_t nr = ggml_nrows(src0);
|
||||
|
||||
// rows per thread
|
||||
const int64_t dr = (nr + nth - 1)/nth;
|
||||
|
||||
// row range for this thread
|
||||
const int64_t ir0 = dr*ith;
|
||||
const int64_t ir1 = MIN(ir0 + dr, nr);
|
||||
|
||||
return {ir0, ir1};
|
||||
}
|
||||
|
||||
#endif
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -51,10 +51,11 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .run_kernel = */ kai_run_matmul_clamp_f32_qsi8d32p1x4_qsi4c32p4vlx4_1x4vl_sme2_sdot,
|
||||
},
|
||||
/* .lhs_info = */ {
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32_neon,
|
||||
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32_neon,
|
||||
/* .get_offset = */ kai_get_lhs_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32_neon,
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32_neon,
|
||||
/* .require_aligned_m_idx = */ true,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32ps1s0scalef16_qsu4c32s16s0_neon,
|
||||
@ -99,6 +100,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .require_aligned_m_idx = */ false,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
@ -142,6 +144,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .require_aligned_m_idx = */ false,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
@ -186,6 +189,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .require_aligned_m_idx = */ false,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
@ -229,6 +233,7 @@ static ggml_kleidiai_kernels gemm_gemv_kernels[] = {
|
||||
/* .get_packed_offset = */ kai_get_lhs_packed_offset_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .packed_size = */ kai_get_lhs_packed_size_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .pack_func = */ kai_run_lhs_quant_pack_qsi8d32p_f32,
|
||||
/* .require_aligned_m_idx = */ false,
|
||||
},
|
||||
/* .rhs_info = */ {
|
||||
/* .packed_size = */ kai_get_rhs_packed_size_rhs_pack_nxk_qsi4c32pscalef16_qsu4c32s16s0,
|
||||
|
@ -40,6 +40,7 @@ struct lhs_packing_info {
|
||||
size_t (*packed_size)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr);
|
||||
void (*pack_func)(size_t m, size_t k, size_t bl, size_t mr, size_t kr, size_t sr, size_t m_idx_start, const float* lhs,
|
||||
size_t lhs_stride, void* lhs_packed);
|
||||
bool require_aligned_m_idx;
|
||||
};
|
||||
|
||||
struct rhs_packing_info {
|
||||
|
@ -124,7 +124,8 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
size_t sr = kernel->get_sr();
|
||||
|
||||
// Calculate number of columns to be processed per thread
|
||||
const size_t num_m_per_thread = kai_roundup(m, mr * nth) / nth;
|
||||
const bool use_multithread = lhs_info->require_aligned_m_idx && m <= mr ? false : true;
|
||||
const size_t num_m_per_thread = use_multithread ? kai_roundup(m, nth) / nth : m;
|
||||
const size_t m_start = ith * num_m_per_thread;
|
||||
size_t m_to_process = num_m_per_thread;
|
||||
if ((m_start + m_to_process) > m) {
|
||||
@ -134,11 +135,11 @@ class tensor_traits : public ggml::cpu::tensor_traits {
|
||||
if(m_start < m) {
|
||||
// Transform LHS
|
||||
const size_t src_stride = src1->nb[1];
|
||||
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(m_start, dst->src[1]->nb[1]));
|
||||
const float * src_ptr = reinterpret_cast<const float *>(lhs + lhs_info->get_offset(0, dst->src[1]->nb[1]));
|
||||
const size_t lhs_packed_offset = lhs_info->get_packed_offset(m_start, k, QK4_0, mr, kr, sr);
|
||||
void * lhs_packed_ptr = static_cast<void *>(lhs_packed + lhs_packed_offset);
|
||||
|
||||
lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, 0, src_ptr, src_stride, lhs_packed_ptr);
|
||||
lhs_info->pack_func(m_to_process, k, QK4_0, mr, kr, sr, m_start, src_ptr, src_stride, lhs_packed_ptr);
|
||||
}
|
||||
|
||||
ggml_barrier(params->threadpool);
|
||||
|
@ -55,7 +55,6 @@
|
||||
|
||||
#include <atomic>
|
||||
#include <array>
|
||||
#include <type_traits>
|
||||
|
||||
#ifdef _MSC_VER
|
||||
#define NOINLINE __declspec(noinline)
|
||||
@ -1093,403 +1092,13 @@ class tinyBLAS_Q0_PPC {
|
||||
}
|
||||
}
|
||||
|
||||
template<typename VA, typename VB, int size>
|
||||
void packNormalInt4(const TA* a, int64_t lda, int rows, int cols, VA* vec, std::array<int, size>& comparray) {
|
||||
template<typename VA, typename VB>
|
||||
void packNormal(const TA* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
|
||||
int64_t i, j;
|
||||
TA *aoffset = NULL;
|
||||
VA *vecOffset = NULL;
|
||||
TA *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TA *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2] = {0};
|
||||
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2] = {0};
|
||||
VB t1, t2, t3, t4, t5, t6, t7, t8;
|
||||
const vector signed char lowMask = vec_splats((signed char)0xF);
|
||||
const vector unsigned char v4 = vec_splats((unsigned char)0x4);
|
||||
const vector signed char v8 = vec_splats((signed char)0x8);
|
||||
aoffset = const_cast<TA*>(a);
|
||||
vecOffset = vec;
|
||||
vector unsigned char swiz1 = {0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23};
|
||||
vector unsigned char swiz2 = {8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31};
|
||||
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||
vector signed int vsum = {0};
|
||||
vector signed int vsum2 = {0};
|
||||
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
|
||||
i = (cols >> 2);
|
||||
if (i > 0) {
|
||||
do {
|
||||
c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
|
||||
c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
|
||||
c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
|
||||
c4[1] = reinterpret_cast<VB>(vec_xl(0, aoffset4->qs));
|
||||
c5[1] = reinterpret_cast<VB>(vec_xl(0, aoffset5->qs));
|
||||
c6[1] = reinterpret_cast<VB>(vec_xl(0, aoffset6->qs));
|
||||
c7[1] = reinterpret_cast<VB>(vec_xl(0, aoffset7->qs));
|
||||
c8[1] = reinterpret_cast<VB>(vec_xl(0, aoffset8->qs));
|
||||
|
||||
c1[0] = vec_and(c1[1], lowMask);
|
||||
c1[1] = vec_sr(c1[1], v4);
|
||||
c1[0] = vec_sub(c1[0], v8);
|
||||
c1[1] = vec_sub(c1[1], v8);
|
||||
vsum = vec_sum4s(c1[0], vsum);
|
||||
vsum2 = vec_sum4s(c1[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c2[0] = vec_and(c2[1], lowMask);
|
||||
c2[1] = vec_sr(c2[1], v4);
|
||||
c2[0] = vec_sub(c2[0], v8);
|
||||
c2[1] = vec_sub(c2[1], v8);
|
||||
vsum = vec_sum4s(c2[0], vsum);
|
||||
vsum2 = vec_sum4s(c2[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c3[0] = vec_and(c3[1], lowMask);
|
||||
c3[1] = vec_sr(c3[1], v4);
|
||||
c3[0] = vec_sub(c3[0], v8);
|
||||
c3[1] = vec_sub(c3[1], v8);
|
||||
vsum = vec_sum4s(c3[0], vsum);
|
||||
vsum2 = vec_sum4s(c3[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c4[0] = vec_and(c4[1], lowMask);
|
||||
c4[1] = vec_sr(c4[1], v4);
|
||||
c4[0] = vec_sub(c4[0], v8);
|
||||
c4[1] = vec_sub(c4[1], v8);
|
||||
vsum = vec_sum4s(c4[0], vsum);
|
||||
vsum2 = vec_sum4s(c4[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c5[0] = vec_and(c5[1], lowMask);
|
||||
c5[1] = vec_sr(c5[1], v4);
|
||||
c5[0] = vec_sub(c5[0], v8);
|
||||
c5[1] = vec_sub(c5[1], v8);
|
||||
vsum = vec_sum4s(c5[0], vsum);
|
||||
vsum2 = vec_sum4s(c5[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[4] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c6[0] = vec_and(c6[1], lowMask);
|
||||
c6[1] = vec_sr(c6[1], v4);
|
||||
c6[0] = vec_sub(c6[0], v8);
|
||||
c6[1] = vec_sub(c6[1], v8);
|
||||
vsum = vec_sum4s(c6[0], vsum);
|
||||
vsum2 = vec_sum4s(c6[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[5] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c7[0] = vec_and(c7[1], lowMask);
|
||||
c7[1] = vec_sr(c7[1], v4);
|
||||
c7[0] = vec_sub(c7[0], v8);
|
||||
c7[1] = vec_sub(c7[1], v8);
|
||||
vsum = vec_sum4s(c7[0], vsum);
|
||||
vsum2 = vec_sum4s(c7[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[6] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c8[0] = vec_and(c8[1], lowMask);
|
||||
c8[1] = vec_sr(c8[1], v4);
|
||||
c8[0] = vec_sub(c8[0], v8);
|
||||
c8[1] = vec_sub(c8[1], v8);
|
||||
vsum = vec_sum4s(c8[0], vsum);
|
||||
vsum2 = vec_sum4s(c8[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[7] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
|
||||
t1 = vec_perm(c5[0], c6[0], swiz1);
|
||||
t2 = vec_perm(c5[0], c6[0], swiz2);
|
||||
t3 = vec_perm(c7[0], c8[0], swiz1);
|
||||
t4 = vec_perm(c7[0], c8[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
vec_xst(t5, 0, vecOffset+128);
|
||||
vec_xst(t6, 0, vecOffset+144);
|
||||
vec_xst(t7, 0, vecOffset+160);
|
||||
vec_xst(t8, 0, vecOffset+176);
|
||||
|
||||
t1 = vec_perm(c5[1], c6[1], swiz1);
|
||||
t2 = vec_perm(c5[1], c6[1], swiz2);
|
||||
t3 = vec_perm(c7[1], c8[1], swiz1);
|
||||
t4 = vec_perm(c7[1], c8[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
vec_xst(t5, 0, vecOffset+192);
|
||||
vec_xst(t6, 0, vecOffset+208);
|
||||
vec_xst(t7, 0, vecOffset+224);
|
||||
vec_xst(t8, 0, vecOffset+240);
|
||||
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
aoffset4 += lda;
|
||||
aoffset5 += lda;
|
||||
aoffset6 += lda;
|
||||
aoffset7 += lda;
|
||||
aoffset8 += lda;
|
||||
vecOffset += 256;
|
||||
i--;
|
||||
} while (i > 0);
|
||||
}
|
||||
j--;
|
||||
} while (j > 0);
|
||||
}
|
||||
|
||||
if (rows & 4) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset += 4 * lda;
|
||||
|
||||
i = (cols >> 2);
|
||||
if (i > 0) {
|
||||
do {
|
||||
c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
|
||||
c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
|
||||
c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
|
||||
c4[1] = reinterpret_cast<VB>(vec_xl(0, aoffset4->qs));
|
||||
|
||||
c1[0] = vec_and(c1[1], lowMask);
|
||||
c1[1] = vec_sr(c1[1], v4);
|
||||
c1[0] = vec_sub(c1[0], v8);
|
||||
c1[1] = vec_sub(c1[1], v8);
|
||||
vsum = vec_sum4s(c1[0], vsum);
|
||||
vsum2 = vec_sum4s(c1[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c2[0] = vec_and(c2[1], lowMask);
|
||||
c2[1] = vec_sr(c2[1], v4);
|
||||
c2[0] = vec_sub(c2[0], v8);
|
||||
c2[1] = vec_sub(c2[1], v8);
|
||||
vsum = vec_sum4s(c2[0], vsum);
|
||||
vsum2 = vec_sum4s(c2[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c3[0] = vec_and(c3[1], lowMask);
|
||||
c3[1] = vec_sr(c3[1], v4);
|
||||
c3[0] = vec_sub(c3[0], v8);
|
||||
c3[1] = vec_sub(c3[1], v8);
|
||||
vsum = vec_sum4s(c3[0], vsum);
|
||||
vsum2 = vec_sum4s(c3[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c4[0] = vec_and(c4[1], lowMask);
|
||||
c4[1] = vec_sr(c4[1], v4);
|
||||
c4[0] = vec_sub(c4[0], v8);
|
||||
c4[1] = vec_sub(c4[1], v8);
|
||||
vsum = vec_sum4s(c4[0], vsum);
|
||||
vsum2 = vec_sum4s(c4[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats( 0);
|
||||
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
aoffset4 += lda;
|
||||
vecOffset += 128;
|
||||
i--;
|
||||
} while (i > 0);
|
||||
}
|
||||
}
|
||||
|
||||
if (rows & 3) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
i = (cols >> 2);
|
||||
if (i > 0) {
|
||||
do {
|
||||
switch(rows) {
|
||||
case 3: c3[1] = reinterpret_cast<VB>(vec_xl(0, aoffset3->qs));
|
||||
case 2: c2[1] = reinterpret_cast<VB>(vec_xl(0, aoffset2->qs));
|
||||
case 1: c1[1] = reinterpret_cast<VB>(vec_xl(0, aoffset1->qs));
|
||||
break;
|
||||
}
|
||||
c1[0] = vec_and(c1[1], lowMask);
|
||||
c1[1] = vec_sr(c1[1], v4);
|
||||
c1[0] = vec_sub(c1[0], v8);
|
||||
c1[1] = vec_sub(c1[1], v8);
|
||||
vsum = vec_sum4s(c1[0], vsum);
|
||||
vsum2 = vec_sum4s(c1[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[0] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c2[0] = vec_and(c2[1], lowMask);
|
||||
c2[1] = vec_sr(c2[1], v4);
|
||||
c2[0] = vec_sub(c2[0], v8);
|
||||
c2[1] = vec_sub(c2[1], v8);
|
||||
vsum = vec_sum4s(c2[0], vsum);
|
||||
vsum2 = vec_sum4s(c2[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[1] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c3[0] = vec_and(c3[1], lowMask);
|
||||
c3[1] = vec_sr(c3[1], v4);
|
||||
c3[0] = vec_sub(c3[0], v8);
|
||||
c3[1] = vec_sub(c3[1], v8);
|
||||
vsum = vec_sum4s(c3[0], vsum);
|
||||
vsum2 = vec_sum4s(c3[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[2] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
c4[0] = vec_and(c4[1], lowMask);
|
||||
c4[1] = vec_sr(c4[1], v4);
|
||||
c4[0] = vec_sub(c4[0], v8);
|
||||
c4[1] = vec_sub(c4[1], v8);
|
||||
vsum = vec_sum4s(c4[0], vsum);
|
||||
vsum2 = vec_sum4s(c4[1], vsum2);
|
||||
vsum = vec_add(vsum, vsum2);
|
||||
comparray[3] = vsum[0] + vsum[1] + vsum[2] + vsum[3];
|
||||
vsum = vec_splats(0);
|
||||
vsum2 = vec_splats(0);
|
||||
|
||||
t1 = vec_perm(c1[0], c2[0], swiz1);
|
||||
t2 = vec_perm(c1[0], c2[0], swiz2);
|
||||
t3 = vec_perm(c3[0], c4[0], swiz1);
|
||||
t4 = vec_perm(c3[0], c4[0], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
vec_xst(t7, 0, vecOffset+32);
|
||||
vec_xst(t8, 0, vecOffset+48);
|
||||
|
||||
t1 = vec_perm(c1[1], c2[1], swiz1);
|
||||
t2 = vec_perm(c1[1], c2[1], swiz2);
|
||||
t3 = vec_perm(c3[1], c4[1], swiz1);
|
||||
t4 = vec_perm(c3[1], c4[1], swiz2);
|
||||
t5 = vec_perm(t1, t3, swiz3);
|
||||
t6 = vec_perm(t1, t3, swiz4);
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
vec_xst(t7, 0, vecOffset+96);
|
||||
vec_xst(t8, 0, vecOffset+112);
|
||||
aoffset1 += lda;
|
||||
aoffset2 += lda;
|
||||
aoffset3 += lda;
|
||||
vecOffset += 128;
|
||||
i--;
|
||||
} while(i > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template<typename VA, typename VB>
|
||||
void packNormal(const TB* a, int64_t lda, int rows, int cols, VA* vec, bool flip) {
|
||||
int64_t i, j;
|
||||
TB *aoffset = NULL;
|
||||
VA *vecOffset = NULL;
|
||||
TB *aoffset1 = NULL, *aoffset2 = NULL, *aoffset3 = NULL, *aoffset4 = NULL;
|
||||
TB *aoffset5 = NULL, *aoffset6 = NULL, *aoffset7 = NULL, *aoffset8 = NULL;
|
||||
__vector_pair C1, C2, C3, C4, C5, C6, C7, C8;
|
||||
VB c1[2] = {0}, c2[2] = {0}, c3[2] = {0}, c4[2]={0};
|
||||
VB c5[2] = {0}, c6[2] = {0}, c7[2] = {0}, c8[2]={0};
|
||||
@ -1502,24 +1111,24 @@ class tinyBLAS_Q0_PPC {
|
||||
vector unsigned char swiz3 = {0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27};
|
||||
vector unsigned char swiz4 = {4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31};
|
||||
|
||||
aoffset = const_cast<TB*>(a);
|
||||
aoffset = const_cast<TA*>(a);
|
||||
vecOffset = vec;
|
||||
j = (rows >> 3);
|
||||
if (j > 0) {
|
||||
do {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset5 = aoffset4 + lda;
|
||||
aoffset6 = aoffset5 + lda;
|
||||
aoffset7 = aoffset6 + lda;
|
||||
aoffset8 = aoffset7 + lda;
|
||||
aoffset += 8 * lda;
|
||||
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
do {
|
||||
C1 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset1->qs);
|
||||
C2 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset2->qs);
|
||||
C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||
@ -1547,10 +1156,10 @@ class tinyBLAS_Q0_PPC {
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset);
|
||||
vec_xst(t6, 0, vecOffset+16);
|
||||
@ -1566,10 +1175,10 @@ class tinyBLAS_Q0_PPC {
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+64);
|
||||
vec_xst(t6, 0, vecOffset+80);
|
||||
@ -1585,10 +1194,10 @@ class tinyBLAS_Q0_PPC {
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+128);
|
||||
vec_xst(t6, 0, vecOffset+144);
|
||||
@ -1604,10 +1213,10 @@ class tinyBLAS_Q0_PPC {
|
||||
t7 = vec_perm(t2, t4, swiz3);
|
||||
t8 = vec_perm(t2, t4, swiz4);
|
||||
if (flip == true) {
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
t5 = vec_xor(t5, xor_vector);
|
||||
t6 = vec_xor(t6, xor_vector);
|
||||
t7 = vec_xor(t7, xor_vector);
|
||||
t8 = vec_xor(t8, xor_vector);
|
||||
}
|
||||
vec_xst(t5, 0, vecOffset+192);
|
||||
vec_xst(t6, 0, vecOffset+208);
|
||||
@ -1631,11 +1240,11 @@ class tinyBLAS_Q0_PPC {
|
||||
}
|
||||
|
||||
if (rows & 4) {
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset += 4 * lda;
|
||||
aoffset1 = aoffset;
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
aoffset4 = aoffset3 + lda;
|
||||
aoffset += 4 * lda;
|
||||
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
@ -1702,7 +1311,7 @@ class tinyBLAS_Q0_PPC {
|
||||
aoffset2 = aoffset1 + lda;
|
||||
aoffset3 = aoffset2 + lda;
|
||||
i = (cols >> 3);
|
||||
if (i > 0) {
|
||||
if (i > 0) {
|
||||
do {
|
||||
switch(rows) {
|
||||
case 3: C3 = __builtin_vsx_lxvp(0, (__vector_pair*)aoffset3->qs);
|
||||
@ -1918,18 +1527,13 @@ class tinyBLAS_Q0_PPC {
|
||||
void KERNEL_4x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[8], vec_B[16] = {0};
|
||||
acc_t acc_0, acc_1;
|
||||
std::array<int, 4> comparray {};
|
||||
std::array<int, 4> comparray;
|
||||
vector float fin_res[8] = {0};
|
||||
vector float vs[8] = {0};
|
||||
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
if (std::is_same_v<TA, block_q4_0>) {
|
||||
packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, 4, 4, (int8_t*)vec_A, comparray);
|
||||
} else {
|
||||
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
|
||||
}
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 4, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
@ -1941,17 +1545,15 @@ class tinyBLAS_Q0_PPC {
|
||||
*((float*)&vs[I+4]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
if (!isAblock_q4) {
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
auto *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<4>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<4>(&acc_1, 0, 4, comparray, vs, fin_res);
|
||||
@ -1963,18 +1565,13 @@ class tinyBLAS_Q0_PPC {
|
||||
void KERNEL_8x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[8] = {0};
|
||||
acc_t acc_0, acc_1;
|
||||
std::array<int, 8> comparray {};
|
||||
std::array<int, 8> comparray;
|
||||
vector float fin_res[8] = {0};
|
||||
vector float vs[8] = {0};
|
||||
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
if (std::is_same_v<TA, block_q4_0>) {
|
||||
packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
|
||||
} else {
|
||||
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
}
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 4, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
@ -1985,17 +1582,15 @@ class tinyBLAS_Q0_PPC {
|
||||
*((float*)&vs[I]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
if (!isAblock_q4) {
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
auto *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
|
||||
@ -2007,20 +1602,15 @@ class tinyBLAS_Q0_PPC {
|
||||
void KERNEL_8x8(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[16], vec_B[16] = {0};
|
||||
acc_t acc_0, acc_1, acc_2, acc_3;
|
||||
std::array<int, 8> comparray {};
|
||||
std::array<int, 8> comparray;
|
||||
vector float fin_res[16] = {0};
|
||||
vector float vs[16] = {0};
|
||||
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
for (int l = 0; l < k; l++) {
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
__builtin_mma_xxsetaccz(&acc_1);
|
||||
__builtin_mma_xxsetaccz(&acc_2);
|
||||
__builtin_mma_xxsetaccz(&acc_3);
|
||||
if (std::is_same_v<TA, block_q4_0>) {
|
||||
packNormalInt4<int8_t, vector signed char, 8>((A+(ii*lda)+l), lda, 8, 4, (int8_t*)vec_A, comparray);
|
||||
} else {
|
||||
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
}
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, 8, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, 8, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x++) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
@ -2034,17 +1624,15 @@ class tinyBLAS_Q0_PPC {
|
||||
*((float*)&vs[I+8]+J) = (unhalf((A+((ii+I)*lda)+l)->d) * unhalf((B+((jj+J+4)*ldb)+l)->d));
|
||||
}
|
||||
}
|
||||
if (!isAblock_q4) {
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
auto *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < 8; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
compute<8>(&acc_0, 0, 0, comparray, vs, fin_res);
|
||||
compute<8>(&acc_1, 4, 4, comparray, vs, fin_res);
|
||||
@ -2065,17 +1653,16 @@ class tinyBLAS_Q0_PPC {
|
||||
int64_t duty = (tiles + nth - 1) / nth;
|
||||
int64_t start = duty * ith;
|
||||
int64_t end = start + duty;
|
||||
vec_t vec_A[8] = {0}, vec_B[8] = {0};
|
||||
vec_t vec_A[8], vec_B[8] = {0};
|
||||
vector signed int vec_C[4];
|
||||
acc_t acc_0;
|
||||
bool isAblock_q4 = std::is_same_v<TA, block_q4_0>;
|
||||
|
||||
if (end > tiles)
|
||||
end = tiles;
|
||||
for (int64_t job = start; job < end; ++job) {
|
||||
int64_t ii = m0 + job / xtiles * RM;
|
||||
int64_t jj = n0 + job % xtiles * RN;
|
||||
std::array<int, 4> comparray{};
|
||||
std::array<int, RM> comparray;
|
||||
vector float res[4] = {0};
|
||||
vector float fin_res[4] = {0};
|
||||
vector float vs[4] = {0};
|
||||
@ -2086,11 +1673,7 @@ class tinyBLAS_Q0_PPC {
|
||||
__builtin_prefetch((A+(ii*lda)+(l+1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_prefetch((B+(jj*ldb)+(l+1))->qs, 0, 1); // prefetch one loop ahead
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
if (isAblock_q4) {
|
||||
packNormalInt4<int8_t, vector signed char, 4>((A+(ii*lda)+l), lda, RM, 4, (int8_t*)vec_A, comparray);
|
||||
} else {
|
||||
packNormal<int8_t, vector signed char>((const TB*)(A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
|
||||
}
|
||||
packNormal<int8_t, vector signed char>((A+(ii*lda)+l), lda, RM, 8, (int8_t*)vec_A, false);
|
||||
packNormal<uint8_t, vector unsigned char>((B+(jj*ldb)+l), ldb, RN, 8, (uint8_t*)vec_B, true);
|
||||
for(int x = 0; x < 8; x+=4) {
|
||||
__builtin_mma_xvi8ger4pp(&acc_0, vec_A[x], vec_B[x]);
|
||||
@ -2104,18 +1687,17 @@ class tinyBLAS_Q0_PPC {
|
||||
}
|
||||
}
|
||||
__builtin_mma_disassemble_acc(vec_C, &acc_0);
|
||||
if (!isAblock_q4) {
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < RM; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
auto *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
auto aoffset = A+(ii*lda)+l;
|
||||
for (int i = 0; i < RM; i++) {
|
||||
comparray[i] = 0;
|
||||
int ca = 0;
|
||||
const int8_t *at = aoffset->qs;
|
||||
for (int j = 0; j < 32; j++)
|
||||
ca += (int)*at++;
|
||||
comparray[i] = ca;
|
||||
aoffset += lda;
|
||||
}
|
||||
|
||||
for (int i = 0; i < RM; i++) {
|
||||
CA[i] = vec_splats((float)(((double)comparray[i]) * -128.0));
|
||||
res[i] = vec_add(vec_ctf(vec_C[i], 0), CA[i]);
|
||||
@ -2431,7 +2013,6 @@ class tinyBLAS_PPC {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void KERNEL_4x4(int64_t ii, int64_t jj) {
|
||||
vec_t vec_A[4], vec_B[4], vec_C[4];
|
||||
acc_t acc_0;
|
||||
@ -2678,27 +2259,15 @@ class tinyBLAS_PPC {
|
||||
vec_t vec_C[4];
|
||||
acc_t acc_0;
|
||||
__builtin_mma_xxsetaccz(&acc_0);
|
||||
vec_t vec_A[4] {0}, vec_B[4] = {0};
|
||||
vec_t vec_A[4], vec_B[4];
|
||||
for (int l=0; l<k; l+=4) {
|
||||
/* 'GEMV Forwarding' concept is used in first two conditional loops.
|
||||
* when one of the matrix has a single row/column, the elements are
|
||||
* broadcasted, instead of using packing routine to prepack the
|
||||
* matrix elements.
|
||||
*/
|
||||
if (RM == 1) {
|
||||
if (RN >= 4 && RM == 1) {
|
||||
TA* a = const_cast<TA*>(A+(ii)*lda+l);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, 4, 4, (TA*)vec_B);
|
||||
vec_A[0] = (vec_t)vec_xl(0,a);
|
||||
vec_A[1] = (vec_t)vec_splats(*((TA*)&vec_A+1));
|
||||
vec_A[2] = (vec_t)vec_splats(*((TA*)&vec_A+2));
|
||||
vec_A[3] = (vec_t)vec_splats(*((TA*)&vec_A+3));
|
||||
} else if (RN == 1) {
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
|
||||
TB* b = const_cast<TB*>(B+(jj)*ldb+l);
|
||||
vec_B[0] = (vec_t)vec_xl(0,b);
|
||||
vec_B[1] = (vec_t)vec_splats(*((TB*)&vec_B+1));
|
||||
vec_B[2] = (vec_t)vec_splats(*((TB*)&vec_B+2));
|
||||
vec_B[3] = (vec_t)vec_splats(*((TB*)&vec_B+3));
|
||||
} else {
|
||||
packTranspose<vector float>(A+(ii*lda)+l, lda, RM, 4, (TA*)vec_A);
|
||||
packTranspose<vector float>(B+(jj*ldb)+l, ldb, RN, 4, (TA*)vec_B);
|
||||
@ -2802,10 +2371,8 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
assert(params->ith < params->nth);
|
||||
|
||||
// only enable sgemm for prompt processing
|
||||
#if !defined(__MMA__)
|
||||
if (n < 2)
|
||||
return false;
|
||||
#endif
|
||||
|
||||
if (Ctype != GGML_TYPE_F32)
|
||||
return false;
|
||||
@ -2936,8 +2503,8 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
|
||||
#elif defined(__MMA__)
|
||||
//TO-DO: Remove this condition once gemv forwarding is enabled.
|
||||
if (n < 8 && n != 4)
|
||||
return false;
|
||||
if (m < 8 && m != 4)
|
||||
@ -2949,6 +2516,7 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
@ -2973,19 +2541,6 @@ bool llamafile_sgemm(const struct ggml_compute_params * params, int64_t m, int64
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#elif defined(__MMA__)
|
||||
//TO-DO: Remove this condition once gemv forwarding is enabled.
|
||||
if (n < 8 && n != 4)
|
||||
return false;
|
||||
if (m < 8 && m != 4)
|
||||
return false;
|
||||
tinyBLAS_Q0_PPC<block_q4_0, block_q8_0, float> tb{
|
||||
k, (const block_q4_0 *)A, lda,
|
||||
(const block_q8_0 *)B, ldb,
|
||||
(float *)C, ldc,
|
||||
params->ith, params->nth};
|
||||
tb.matmul(m, n);
|
||||
return true;
|
||||
#else
|
||||
return false;
|
||||
#endif
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,128 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml.h"
|
||||
|
||||
//
|
||||
// cache line
|
||||
//
|
||||
|
||||
#if defined(__cpp_lib_hardware_interference_size)
|
||||
#define CACHE_LINE_SIZE std::hardware_destructive_interference_size
|
||||
#else
|
||||
#if defined(__POWER9_VECTOR__)
|
||||
#define CACHE_LINE_SIZE 128
|
||||
#elif defined(__VXE__) || defined(__VXE2__)
|
||||
#define CACHE_LINE_SIZE 256
|
||||
#else
|
||||
#define CACHE_LINE_SIZE 64
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float);
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
void ggml_compute_forward_dup(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_add(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_add1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_acc(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sum(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_sum_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_mean(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_argmax(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_count_equal(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_repeat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_repeat_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_concat(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_silu_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rms_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rms_norm_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_group_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_l2_norm(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_out_prod(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_scale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_set(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cpy(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cont(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_reshape(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_view(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_permute(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_transpose(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_get_rows(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_get_rows_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_diag(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_diag_mask_inf(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_diag_mask_zero(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_soft_max(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_soft_max_ext_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rope(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rope_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_clamp(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_transpose_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_im2col_back_f32(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_conv_transpose_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pool_2d_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_upscale(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_pad_reflect_1d(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_arange(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_timestep_embedding(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_argsort(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_leaky_relu(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_flash_attn_ext(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * q,
|
||||
const struct ggml_tensor * k,
|
||||
const struct ggml_tensor * v,
|
||||
const struct ggml_tensor * mask,
|
||||
struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_flash_attn_back(
|
||||
const struct ggml_compute_params * params,
|
||||
const bool masked,
|
||||
struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_ssm_conv(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_ssm_scan(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_win_part(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_win_unpart(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_unary(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_get_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_add_rel_pos(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_unary(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_unary_op_f32_t fun);
|
||||
void ggml_compute_forward_map_binary(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_binary_op_f32_t fun);
|
||||
void ggml_compute_forward_map_custom1_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_custom1_op_f32_t fun);
|
||||
void ggml_compute_forward_map_custom2_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_custom2_op_f32_t fun);
|
||||
void ggml_compute_forward_map_custom3_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
struct ggml_tensor * dst,
|
||||
const ggml_custom3_op_f32_t fun);
|
||||
void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cross_entropy_loss(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_cross_entropy_loss_back(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
void ggml_compute_forward_opt_step_adamw(const struct ggml_compute_params * params, struct ggml_tensor * dst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
@ -1,884 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "ggml-cpu-impl.h"
|
||||
|
||||
//
|
||||
// simd mappings
|
||||
//
|
||||
|
||||
// we define a common set of C macros which map to specific intrinsics based on the current architecture
|
||||
// we then implement the fundamental computation operations below using only these macros
|
||||
// adding support for new architectures requires to define the corresponding SIMD macros
|
||||
//
|
||||
// GGML_F32_STEP / GGML_F16_STEP
|
||||
// number of elements to process in a single step
|
||||
//
|
||||
// GGML_F32_EPR / GGML_F16_EPR
|
||||
// number of elements to fit in a single register
|
||||
//
|
||||
|
||||
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FMA)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 NEON
|
||||
|
||||
#define GGML_F32_STEP 16
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 float32x4_t
|
||||
#define GGML_F32x4_ZERO vdupq_n_f32(0.0f)
|
||||
#define GGML_F32x4_SET1(x) vdupq_n_f32(x)
|
||||
#define GGML_F32x4_LOAD vld1q_f32
|
||||
#define GGML_F32x4_STORE vst1q_f32
|
||||
#define GGML_F32x4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
||||
#define GGML_F32x4_ADD vaddq_f32
|
||||
#define GGML_F32x4_MUL vmulq_f32
|
||||
#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
(res) = (ggml_float) GGML_F32x4_REDUCE_ONE((x)[0]); \
|
||||
}
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
// F16 NEON
|
||||
|
||||
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
|
||||
#define GGML_F16_STEP 32
|
||||
#define GGML_F16_EPR 8
|
||||
|
||||
#define GGML_F16x8 float16x8_t
|
||||
#define GGML_F16x8_ZERO vdupq_n_f16(0.0f)
|
||||
#define GGML_F16x8_SET1(x) vdupq_n_f16(x)
|
||||
#define GGML_F16x8_LOAD(x) vld1q_f16((const ggml_fp16_internal_t *)(x))
|
||||
#define GGML_F16x8_STORE vst1q_f16
|
||||
#define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
|
||||
#define GGML_F16x8_ADD vaddq_f16
|
||||
#define GGML_F16x8_MUL vmulq_f16
|
||||
#define GGML_F16x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F16_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
(x)[i] = vaddq_f16((x)[i], (x)[offset+i]); \
|
||||
} \
|
||||
const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
|
||||
const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
|
||||
(res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F16x8
|
||||
#define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F16x8_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F16x8_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F16x8_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F16x8_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F16x8_REDUCE
|
||||
#else
|
||||
// if FP16 vector arithmetic is not supported, we use FP32 instead
|
||||
// and take advantage of the vcvt_ functions to convert to/from FP16
|
||||
|
||||
#define GGML_F16_STEP 16
|
||||
#define GGML_F16_EPR 4
|
||||
|
||||
#define GGML_F32Cx4 float32x4_t
|
||||
#define GGML_F32Cx4_ZERO vdupq_n_f32(0.0f)
|
||||
#define GGML_F32Cx4_SET1(x) vdupq_n_f32(x)
|
||||
#define GGML_F32Cx4_LOAD(x) vcvt_f32_f16(vld1_f16((const ggml_fp16_internal_t *)(x)))
|
||||
#define GGML_F32Cx4_STORE(x, y) vst1_f16(x, vcvt_f16_f32(y))
|
||||
#define GGML_F32Cx4_FMA(a, b, c) vfmaq_f32(a, b, c)
|
||||
#define GGML_F32Cx4_ADD vaddq_f32
|
||||
#define GGML_F32Cx4_MUL vmulq_f32
|
||||
#define GGML_F32Cx4_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
#define GGML_F16_VEC GGML_F32Cx4
|
||||
#define GGML_F16_VEC_ZERO GGML_F32Cx4_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32Cx4_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx4_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx4_STORE((ggml_fp16_internal_t *)(p), r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F32Cx4_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32Cx4_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32Cx4_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx4_REDUCE
|
||||
#endif
|
||||
|
||||
#elif defined(__AVX512F__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 AVX512
|
||||
|
||||
#define GGML_F32_STEP 64
|
||||
#define GGML_F32_EPR 16
|
||||
|
||||
#define GGML_F32x16 __m512
|
||||
#define GGML_F32x16_ZERO _mm512_setzero_ps()
|
||||
#define GGML_F32x16_SET1(x) _mm512_set1_ps(x)
|
||||
#define GGML_F32x16_LOAD _mm512_loadu_ps
|
||||
#define GGML_F32x16_STORE _mm512_storeu_ps
|
||||
// _mm512_fmadd_ps is defined in AVX512F so no guard is required
|
||||
#define GGML_F32x16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
||||
#define GGML_F32x16_ADD _mm512_add_ps
|
||||
#define GGML_F32x16_MUL _mm512_mul_ps
|
||||
#define GGML_F32x16_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
|
||||
} while (0)
|
||||
|
||||
// TODO: is this optimal ?
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x16
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x16_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x16_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x16_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x16_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x16_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x16_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x16_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x16_REDUCE
|
||||
|
||||
// F16 AVX512
|
||||
|
||||
// F16 AVX
|
||||
|
||||
#define GGML_F16_STEP 64
|
||||
#define GGML_F16_EPR 16
|
||||
|
||||
// AVX512 has FP16 extension (AVX512_FP16) but I don't have it on my machine so I use FP32 instead
|
||||
|
||||
#define GGML_F32Cx16 __m512
|
||||
#define GGML_F32Cx16_ZERO _mm512_setzero_ps()
|
||||
#define GGML_F32Cx16_SET1(x) _mm512_set1_ps(x)
|
||||
|
||||
// unlike _mm256_cvt intrinsics that require F16C, _mm512_cvt is defined in AVX512F
|
||||
// so F16C guard isn't required
|
||||
#define GGML_F32Cx16_LOAD(x) _mm512_cvtph_ps(_mm256_loadu_si256((const __m256i *)(x)))
|
||||
#define GGML_F32Cx16_STORE(x, y) _mm256_storeu_si256((__m256i *)(x), _mm512_cvtps_ph(y, 0))
|
||||
|
||||
#define GGML_F32Cx16_FMA(a, b, c) _mm512_fmadd_ps(b, c, a)
|
||||
#define GGML_F32Cx16_ADD _mm512_add_ps
|
||||
#define GGML_F32Cx16_MUL _mm512_mul_ps
|
||||
#define GGML_F32Cx16_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm512_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = (ggml_float) _mm512_reduce_add_ps(x[0]); \
|
||||
} while (0)
|
||||
|
||||
#define GGML_F16_VEC GGML_F32Cx16
|
||||
#define GGML_F16_VEC_ZERO GGML_F32Cx16_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32Cx16_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx16_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx16_STORE(p, r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F32Cx16_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32Cx16_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32Cx16_MUL
|
||||
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx16_REDUCE
|
||||
#elif defined(__AVX__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 AVX
|
||||
|
||||
#define GGML_F32_STEP 32
|
||||
#define GGML_F32_EPR 8
|
||||
|
||||
#define GGML_F32x8 __m256
|
||||
#define GGML_F32x8_ZERO _mm256_setzero_ps()
|
||||
#define GGML_F32x8_SET1(x) _mm256_set1_ps(x)
|
||||
#define GGML_F32x8_LOAD _mm256_loadu_ps
|
||||
#define GGML_F32x8_STORE _mm256_storeu_ps
|
||||
#if defined(__FMA__)
|
||||
#define GGML_F32x8_FMA(a, b, c) _mm256_fmadd_ps(b, c, a)
|
||||
#else
|
||||
#define GGML_F32x8_FMA(a, b, c) _mm256_add_ps(_mm256_mul_ps(b, c), a)
|
||||
#endif
|
||||
#define GGML_F32x8_ADD _mm256_add_ps
|
||||
#define GGML_F32x8_MUL _mm256_mul_ps
|
||||
#define GGML_F32x8_REDUCE(res, x) \
|
||||
do { \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = _mm256_add_ps(x[i], x[offset+i]); \
|
||||
} \
|
||||
const __m128 t0 = _mm_add_ps(_mm256_castps256_ps128(x[0]), \
|
||||
_mm256_extractf128_ps(x[0], 1)); \
|
||||
const __m128 t1 = _mm_hadd_ps(t0, t0); \
|
||||
res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t1, t1)); \
|
||||
} while (0)
|
||||
// TODO: is this optimal ?
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x8
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x8_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x8_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x8_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x8_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x8_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x8_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x8_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
|
||||
|
||||
// F16 AVX
|
||||
|
||||
#define GGML_F16_STEP 32
|
||||
#define GGML_F16_EPR 8
|
||||
|
||||
// F16 arithmetic is not supported by AVX, so we use F32 instead
|
||||
|
||||
#define GGML_F32Cx8 __m256
|
||||
#define GGML_F32Cx8_ZERO _mm256_setzero_ps()
|
||||
#define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x)
|
||||
|
||||
#if defined(__F16C__)
|
||||
// the _mm256_cvt intrinsics require F16C
|
||||
#define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
|
||||
#define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
|
||||
#else
|
||||
static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
|
||||
float tmp[8];
|
||||
|
||||
for (int i = 0; i < 8; i++) {
|
||||
tmp[i] = GGML_FP16_TO_FP32(x[i]);
|
||||
}
|
||||
|
||||
return _mm256_loadu_ps(tmp);
|
||||
}
|
||||
static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) {
|
||||
float arr[8];
|
||||
|
||||
_mm256_storeu_ps(arr, y);
|
||||
|
||||
for (int i = 0; i < 8; i++)
|
||||
x[i] = GGML_FP32_TO_FP16(arr[i]);
|
||||
}
|
||||
#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x)
|
||||
#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y)
|
||||
#endif
|
||||
|
||||
#define GGML_F32Cx8_FMA GGML_F32x8_FMA
|
||||
#define GGML_F32Cx8_ADD _mm256_add_ps
|
||||
#define GGML_F32Cx8_MUL _mm256_mul_ps
|
||||
#define GGML_F32Cx8_REDUCE GGML_F32x8_REDUCE
|
||||
|
||||
#define GGML_F16_VEC GGML_F32Cx8
|
||||
#define GGML_F16_VEC_ZERO GGML_F32Cx8_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32Cx8_SET1
|
||||
#define GGML_F16_VEC_LOAD(p, i) GGML_F32Cx8_LOAD(p)
|
||||
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
|
||||
#define GGML_F16_VEC_FMA GGML_F32Cx8_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32Cx8_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32Cx8_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32Cx8_REDUCE
|
||||
|
||||
#elif defined(__POWER9_VECTOR__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 POWER9
|
||||
|
||||
#define GGML_F32_STEP 32
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 vector float
|
||||
#define GGML_F32x4_ZERO 0.0f
|
||||
#define GGML_F32x4_SET1 vec_splats
|
||||
#define GGML_F32x4_LOAD(p) vec_xl(0, p)
|
||||
#define GGML_F32x4_STORE(p, r) vec_xst(r, 0, p)
|
||||
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
|
||||
#define GGML_F32x4_ADD vec_add
|
||||
#define GGML_F32x4_MUL vec_mul
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vec_add(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vec_add(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = vec_add(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = vec_extract(x[0], 0) + \
|
||||
vec_extract(x[0], 1) + \
|
||||
vec_extract(x[0], 2) + \
|
||||
vec_extract(x[0], 3); \
|
||||
}
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
// F16 POWER9
|
||||
#define GGML_F16_STEP GGML_F32_STEP
|
||||
#define GGML_F16_EPR GGML_F32_EPR
|
||||
#define GGML_F16_VEC GGML_F32x4
|
||||
#define GGML_F16_VEC_ZERO GGML_F32x4_ZERO
|
||||
#define GGML_F16_VEC_SET1 GGML_F32x4_SET1
|
||||
#define GGML_F16_VEC_FMA GGML_F32x4_FMA
|
||||
#define GGML_F16_VEC_ADD GGML_F32x4_ADD
|
||||
#define GGML_F16_VEC_MUL GGML_F32x4_MUL
|
||||
#define GGML_F16_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
// Use vec_xl, not vec_ld, in case the load address is not aligned.
|
||||
#define GGML_F16_VEC_LOAD(p, i) (i & 0x1) ? \
|
||||
vec_extract_fp32_from_shorth(vec_xl(0, p - GGML_F16_EPR)) : \
|
||||
vec_extract_fp32_from_shortl(vec_xl(0, p))
|
||||
#define GGML_ENDIAN_BYTE(i) ((unsigned char *)&(uint16_t){1})[i]
|
||||
#define GGML_F16_VEC_STORE(p, r, i) \
|
||||
if (i & 0x1) \
|
||||
vec_xst(vec_pack_to_short_fp32(r[i - GGML_ENDIAN_BYTE(1)], \
|
||||
r[i - GGML_ENDIAN_BYTE(0)]), \
|
||||
0, p - GGML_F16_EPR)
|
||||
|
||||
#elif defined(__wasm_simd128__)
|
||||
|
||||
#define GGML_SIMD
|
||||
|
||||
// F32 WASM
|
||||
|
||||
#define GGML_F32_STEP 16
|
||||
#define GGML_F32_EPR 4
|
||||
|
||||
#define GGML_F32x4 v128_t
|
||||
#define GGML_F32x4_ZERO wasm_f32x4_splat(0.0f)
|
||||
#define GGML_F32x4_SET1(x) wasm_f32x4_splat(x)
|
||||
#define GGML_F32x4_LOAD wasm_v128_load
|
||||
#define GGML_F32x4_STORE wasm_v128_store
|
||||
#define GGML_F32x4_FMA(a, b, c) wasm_f32x4_add(wasm_f32x4_mul(b, c), a)
|
||||
#define GGML_F32x4_ADD wasm_f32x4_add
|
||||
#define GGML_F32x4_MUL wasm_f32x4_mul
|
||||
#define GGML_F32x4_REDUCE(res, x) \
|
||||
{ \
|
||||
int offset = GGML_F32_ARR >> 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
||||
} \
|
||||
offset >>= 1; \
|
||||
for (int i = 0; i < offset; ++i) { \
|
||||
x[i] = wasm_f32x4_add(x[i], x[offset+i]); \
|
||||
} \
|
||||
res = wasm_f32x4_extract_lane(x[0], 0) + \
|
||||
wasm_f32x4_extract_lane(x[0], 1) + \
|
||||
wasm_f32x4_extract_lane(x[0], 2) + \
|
||||
wasm_f32x4_extract_lane(x[0], 3); \
|
||||
}
|
||||
|
||||
#define GGML_F32_VEC GGML_F32x4
|
||||
#define GGML_F32_VEC_ZERO GGML_F32x4_ZERO
|
||||
#define GGML_F32_VEC_SET1 GGML_F32x4_SET1
|
||||
#define GGML_F32_VEC_LOAD GGML_F32x4_LOAD
|
||||
#define GGML_F32_VEC_STORE GGML_F32x4_STORE
|
||||
#define GGML_F32_VEC_FMA GGML_F32x4_FMA
|
||||
#define GGML_F32_VEC_ADD GGML_F32x4_ADD
|
||||
#define GGML_F32_VEC_MUL GGML_F32x4_MUL
|
||||
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
// F16 WASM
|
||||
|
||||
#define GGML_F16_STEP 16
|
||||
#define GGML_F16_EPR 4
|
||||
|
||||
inline static v128_t __wasm_f16x4_load(const ggml_fp16_t * p) {
|
||||
float tmp[4];
|
||||
|
||||
tmp[0] = GGML_FP16_TO_FP32(p[0]);
|
||||
tmp[1] = GGML_FP16_TO_FP32(p[1]);
|
||||
tmp[2] = GGML_FP16_TO_FP32(p[2]);
|
||||
tmp[3] = GGML_FP16_TO_FP32(p[3]);
|
||||
|
||||
return wasm_v128_load(tmp);
|
||||
}
|
||||
|
||||
inline static void __wasm_f16x4_store(ggml_fp16_t * p, v128_t x) {
|
||||
float tmp[4];
|
||||
|
||||
wasm_v128_store(tmp, x);
|
||||
|
||||
p[0] = GGML_FP32_TO_FP16(tmp[0]);
|
||||
p[1] = GGML_FP32_TO_FP16(tmp[1]);
|
||||
p[2] = GGML_FP32_TO_FP16(tmp[2]);
|
||||
p[3] = GGML_FP32_TO_FP16(tmp[3]);
|
||||
}
|
||||
|
||||
// F16 data is widened to f32 lanes by the load helper, so the vector type and
// the arithmetic below are the plain f32x4 WASM operations.
#define GGML_F16x4              v128_t
#define GGML_F16x4_ZERO         wasm_f32x4_splat(0.0f)
#define GGML_F16x4_SET1(x)      wasm_f32x4_splat(x)
#define GGML_F16x4_LOAD(x)      __wasm_f16x4_load(x)
#define GGML_F16x4_STORE(x, y)  __wasm_f16x4_store(x, y)
#define GGML_F16x4_FMA          GGML_F32x4_FMA
#define GGML_F16x4_ADD          wasm_f32x4_add
#define GGML_F16x4_MUL          wasm_f32x4_mul

// Folds the GGML_F16_ARR accumulators x[0..ARR-1] pairwise into x[0] (x is
// overwritten), then adds the four lanes of x[0] and assigns the sum to res.
// The halving loop runs until a single register is left, so it stays correct
// for any power-of-two GGML_F16_ARR rather than only for ARR <= 8.
#define GGML_F16x4_REDUCE(res, x)                                     \
{                                                                     \
    for (int offset = GGML_F16_ARR >> 1; offset > 0; offset >>= 1) {  \
        for (int i = 0; i < offset; ++i) {                            \
            x[i] = wasm_f32x4_add(x[i], x[offset+i]);                 \
        }                                                             \
    }                                                                 \
    res = (ggml_float) (wasm_f32x4_extract_lane(x[0], 0) +            \
                        wasm_f32x4_extract_lane(x[0], 1) +            \
                        wasm_f32x4_extract_lane(x[0], 2) +            \
                        wasm_f32x4_extract_lane(x[0], 3));            \
}

// Generic F16 vector interface. The index argument `i` is ignored by LOAD;
// STORE writes register r[i].
#define GGML_F16_VEC                GGML_F16x4
#define GGML_F16_VEC_ZERO           GGML_F16x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F16x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F16x4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F16x4_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F16x4_FMA
#define GGML_F16_VEC_ADD            GGML_F16x4_ADD
#define GGML_F16_VEC_MUL            GGML_F16x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F16x4_REDUCE
|
||||
|
||||
#elif defined(__SSE3__)
|
||||
|
||||
#define GGML_SIMD

// F32 SSE
//
// Maps the generic GGML_F32_VEC_* interface onto __m128 registers using
// unaligned loads and stores.

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    _mm_setzero_ps()
#define GGML_F32x4_SET1(x) _mm_set1_ps(x)
#define GGML_F32x4_LOAD    _mm_loadu_ps
#define GGML_F32x4_STORE   _mm_storeu_ps
// GGML_F32x4_FMA(a, b, c) evaluates a + b*c in both variants.
#if defined(__FMA__)
    // _mm_fmadd_ps(b, c, a) computes b*c + a with a single rounding
    #define GGML_F32x4_FMA(a, b, c) _mm_fmadd_ps(b, c, a)
#else
    #define GGML_F32x4_FMA(a, b, c) _mm_add_ps(_mm_mul_ps(b, c), a)
#endif
#define GGML_F32x4_ADD     _mm_add_ps
#define GGML_F32x4_MUL     _mm_mul_ps

// Folds the GGML_F32_ARR accumulators x[0..ARR-1] pairwise into x[0] (x is
// overwritten), then sums the four lanes of x[0] with two horizontal adds and
// assigns the result to res.
// The halving loop runs until a single register is left, so it stays correct
// for any power-of-two GGML_F32_ARR rather than only for ARR <= 8.
#define GGML_F32x4_REDUCE(res, x)                                     \
{                                                                     \
    for (int offset = GGML_F32_ARR >> 1; offset > 0; offset >>= 1) {  \
        for (int i = 0; i < offset; ++i) {                            \
            x[i] = _mm_add_ps(x[i], x[offset+i]);                     \
        }                                                             \
    }                                                                 \
    const __m128 t0 = _mm_hadd_ps(x[0], x[0]);                        \
    res = (ggml_float) _mm_cvtss_f32(_mm_hadd_ps(t0, t0));            \
}
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
// F16 SSE
|
||||
|
||||
// F16 SSE loop geometry: GGML_F16_ARR (= STEP / EPR) registers are used per step.
// EPR is presumably "elements per register" -- confirm against the callers.
#define GGML_F16_STEP 32
#define GGML_F16_EPR  4
|
||||
|
||||
// Converts the four fp16 values x[0..3] to fp32 through a stack buffer and
// returns them as one __m128 (unaligned load of the buffer).
static inline __m128 __sse_f16x4_load(const ggml_fp16_t * x) {
    float lanes[4];

    for (int k = 0; k < 4; ++k) {
        lanes[k] = GGML_FP16_TO_FP32(x[k]);
    }

    return _mm_loadu_ps(lanes);
}
|
||||
|
||||
// Spills the vector y to a stack buffer of four floats and writes each one,
// converted to fp16, to x[0..3].
static inline void __sse_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float lanes[4];

    _mm_storeu_ps(lanes, y);

    for (int k = 0; k < 4; ++k) {
        x[k] = GGML_FP32_TO_FP16(lanes[k]);
    }
}
|
||||
|
||||
// "F32C" vectors: fp16 data in memory, fp32 lanes in registers. Only LOAD and
// STORE differ from the F32x4 mapping (they go through the fp16 helpers).
#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        _mm_setzero_ps()
#define GGML_F32Cx4_SET1(x)     _mm_set1_ps(x)
#define GGML_F32Cx4_LOAD(x)     __sse_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __sse_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         _mm_add_ps
#define GGML_F32Cx4_MUL         _mm_mul_ps
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

// Generic F16 vector interface. The index argument `i` is ignored by LOAD;
// STORE writes register r[i].
#define GGML_F16_VEC                 GGML_F32Cx4
#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
|
||||
|
||||
#elif defined(__loongarch_asx)
|
||||
|
||||
#define GGML_SIMD

// F32 LASX
//
// Maps the generic GGML_F32_VEC_* interface onto __m256 registers.
#define GGML_F32_STEP 32
#define GGML_F32_EPR  8

#define GGML_F32x8              __m256
#define GGML_F32x8_ZERO         (__m256)__lasx_xvldi(0)
#define GGML_F32x8_SET1(x)      (__m256)__lasx_xvreplfr2vr_s((x))
#define GGML_F32x8_LOAD(x)      (__m256)__lasx_xvld((x), 0)
#define GGML_F32x8_STORE(x,y)   __lasx_xvst((y), (x), 0)
// GGML_F32x8_FMA(a, b, c) evaluates a + b*c.
#define GGML_F32x8_FMA(a, b, c) __lasx_xvfmadd_s(b, c, a)
#define GGML_F32x8_ADD          __lasx_xvfadd_s
#define GGML_F32x8_MUL          __lasx_xvfmul_s

// Folds the GGML_F32_ARR accumulators x[0..ARR-1] pairwise into x[0] (x is
// overwritten), then adds the eight floats of x[0] and assigns the sum to res.
// The halving loop runs until a single register is left, so it stays correct
// for any power-of-two GGML_F32_ARR rather than only for ARR <= 8.
#define GGML_F32x8_REDUCE(res, x)                                     \
do {                                                                  \
    for (int offset = GGML_F32_ARR >> 1; offset > 0; offset >>= 1) {  \
        for (int i = 0; i < offset; ++i) {                            \
            x[i] = __lasx_xvfadd_s(x[i], x[offset+i]);                \
        }                                                             \
    }                                                                 \
    float *tmp_p = (float *)&x[0];                                    \
    res = tmp_p[0] + tmp_p[1] + tmp_p[2] + tmp_p[3] + tmp_p[4] + tmp_p[5] + tmp_p[6] + tmp_p[7]; \
} while (0)
// TODO: is this optimal ?

#define GGML_F32_VEC        GGML_F32x8
#define GGML_F32_VEC_ZERO   GGML_F32x8_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x8_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x8_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x8_STORE
#define GGML_F32_VEC_FMA    GGML_F32x8_FMA
#define GGML_F32_VEC_ADD    GGML_F32x8_ADD
#define GGML_F32_VEC_MUL    GGML_F32x8_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x8_REDUCE
|
||||
|
||||
// F16 LASX
|
||||
|
||||
// F16 LASX loop geometry: GGML_F16_ARR (= STEP / EPR) registers are used per step.
#define GGML_F16_STEP 32
#define GGML_F16_EPR  8

// F16 arithmetic is not supported by LASX, so we use F32 instead

#define GGML_F32Cx8          __m256
#define GGML_F32Cx8_ZERO    (__m256)__lasx_xvldi(0)
// Broadcast with the float helper, exactly like GGML_F32x8_SET1: the lanes are
// fp32, and the integer broadcast (__lasx_xvreplgr2vr_w) would convert a float
// argument to int before replicating it.
#define GGML_F32Cx8_SET1(x) (__m256)__lasx_xvreplfr2vr_s((x))
|
||||
|
||||
static inline __m256 __lasx_f32cx8_load(const ggml_fp16_t * x) {
|
||||
__m256i a;
|
||||
memcpy(&a, x, sizeof(ggml_fp16_t) * 8);
|
||||
a = __lasx_xvpermi_d(a, 0 | (1 << 4));
|
||||
return __lasx_xvfcvtl_s_h(a);
|
||||
}
|
||||
|
||||
// Narrows the fp32 vector y to fp16 and writes 8 * sizeof(ggml_fp16_t) bytes to x.
// NOTE(review): the permute selects 64-bit elements 0 and 2 of the converted
// vector, presumably to pack the eight halves contiguously -- confirm against
// the LASX intrinsics reference.
static inline void __lasx_f32cx8_store(ggml_fp16_t * x, __m256 y) {
    const __m256i narrowed = __lasx_xvfcvt_h_s(y, y);
    const __m256i gathered = __lasx_xvpermi_d(narrowed, 0 | (2 << 2));

    memcpy(x, &gathered, 8 * sizeof(ggml_fp16_t));
}
|
||||
// fp16 memory <-> fp32 register transfers go through the helpers above; the
// arithmetic reuses the F32x8 operations.
#define GGML_F32Cx8_LOAD(x)     __lasx_f32cx8_load(x)
#define GGML_F32Cx8_STORE(x, y) __lasx_f32cx8_store(x, y)

#define GGML_F32Cx8_FMA         GGML_F32x8_FMA
#define GGML_F32Cx8_ADD         __lasx_xvfadd_s
#define GGML_F32Cx8_MUL         __lasx_xvfmul_s
#define GGML_F32Cx8_REDUCE      GGML_F32x8_REDUCE

// Generic F16 vector interface. The index argument `i` is ignored by LOAD;
// STORE writes register r[i].
#define GGML_F16_VEC                GGML_F32Cx8
#define GGML_F16_VEC_ZERO           GGML_F32Cx8_ZERO
#define GGML_F16_VEC_SET1           GGML_F32Cx8_SET1
#define GGML_F16_VEC_LOAD(p, i)     GGML_F32Cx8_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i) GGML_F32Cx8_STORE(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32Cx8_FMA
#define GGML_F16_VEC_ADD            GGML_F32Cx8_ADD
#define GGML_F16_VEC_MUL            GGML_F32Cx8_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32Cx8_REDUCE
|
||||
|
||||
#elif defined(__loongarch_sx)
|
||||
|
||||
#define GGML_SIMD

// F32 LSX
//
// Maps the generic GGML_F32_VEC_* interface onto 128-bit LSX registers.

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4         __m128
#define GGML_F32x4_ZERO    __lsx_vldi(0)
// NOTE(review): this inserts x into element 0 of a zero vector through the
// integer insert intrinsic; it does not look like a float broadcast to all
// four lanes (compare GGML_F32x8_SET1 in the LASX branch) -- confirm and fix
// with whatever float-splat helper is available for LSX-only builds.
#define GGML_F32x4_SET1(x) __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32x4_LOAD(x) __lsx_vld((x), 0)
// Macro parameters must be plain identifiers; the previous "((x),(y))"
// parameter list was not a valid function-like macro definition.
#define GGML_F32x4_STORE(x, y)  __lsx_vst((y), (x), 0)
// GGML_F32x4_FMA(a, b, c) evaluates a + b*c.
#define GGML_F32x4_FMA(a, b, c) __lsx_vfmadd_s(b, c, a)
#define GGML_F32x4_ADD     __lsx_vfadd_s
#define GGML_F32x4_MUL     __lsx_vfmul_s

// Folds the GGML_F32_ARR accumulators x[0..ARR-1] pairwise into x[0] (x is
// overwritten), then adds the four floats of x[0] and assigns the sum to res.
// The lanes are read as floats (same approach as the LASX reduce): pulling a
// lane out with __lsx_vpickve2gr_w yields its integer bit pattern, and casting
// that to ggml_float converts the integer instead of reinterpreting the float.
// The halving loop runs until a single register is left, so it stays correct
// for any power-of-two GGML_F32_ARR rather than only for ARR <= 8.
#define GGML_F32x4_REDUCE(res, x)                                          \
{                                                                          \
    for (int offset = GGML_F32_ARR >> 1; offset > 0; offset >>= 1) {       \
        for (int i = 0; i < offset; ++i) {                                 \
            x[i] = __lsx_vfadd_s(x[i], x[offset + i]);                     \
        }                                                                  \
    }                                                                      \
    const float * tmp_p = (const float *) &x[0];                           \
    res = (ggml_float) ((tmp_p[0] + tmp_p[1]) + (tmp_p[2] + tmp_p[3]));    \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
// F16 LSX
|
||||
|
||||
// F16 LSX loop geometry: GGML_F16_ARR (= STEP / EPR) registers are used per step.
// EPR is presumably "elements per register" -- confirm against the callers.
#define GGML_F16_STEP 32
#define GGML_F16_EPR  4
|
||||
|
||||
// Converts the four fp16 values x[0..3] to fp32 through a stack buffer and
// returns the buffer loaded as one LSX vector.
static inline __m128 __lsx_f16x4_load(const ggml_fp16_t * x) {
    float lanes[4];

    for (int k = 0; k < 4; ++k) {
        lanes[k] = GGML_FP16_TO_FP32(x[k]);
    }

    return __lsx_vld(lanes, 0);
}
|
||||
|
||||
// Spills the vector y to a stack buffer of four floats and writes each one,
// converted to fp16, to x[0..3].
static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
    float lanes[4];

    __lsx_vst(y, lanes, 0);

    for (int k = 0; k < 4; ++k) {
        x[k] = GGML_FP32_TO_FP16(lanes[k]);
    }
}
|
||||
|
||||
// "F32C" vectors: fp16 data in memory, fp32 lanes in registers. Only LOAD and
// STORE differ from the F32x4 mapping (they go through the fp16 helpers).
#define GGML_F32Cx4             __m128
#define GGML_F32Cx4_ZERO        __lsx_vldi(0)
// NOTE(review): same construction as GGML_F32x4_SET1 -- an integer insert into
// element 0 of a zero vector; it does not look like a float broadcast. Confirm.
#define GGML_F32Cx4_SET1(x)     __lsx_vinsgr2vr_w(__lsx_vldi(0),(x), 0)
#define GGML_F32Cx4_LOAD(x)     __lsx_f16x4_load(x)
#define GGML_F32Cx4_STORE(x, y) __lsx_f16x4_store(x, y)
#define GGML_F32Cx4_FMA         GGML_F32x4_FMA
#define GGML_F32Cx4_ADD         __lsx_vfadd_s
#define GGML_F32Cx4_MUL         __lsx_vfmul_s
#define GGML_F32Cx4_REDUCE      GGML_F32x4_REDUCE

// Generic F16 vector interface. The index argument `i` is ignored by LOAD;
// STORE writes register r[i].
#define GGML_F16_VEC                 GGML_F32Cx4
#define GGML_F16_VEC_ZERO            GGML_F32Cx4_ZERO
#define GGML_F16_VEC_SET1            GGML_F32Cx4_SET1
#define GGML_F16_VEC_LOAD(p, i)      GGML_F32Cx4_LOAD(p)
#define GGML_F16_VEC_STORE(p, r, i)  GGML_F32Cx4_STORE(p, r[i])
#define GGML_F16_VEC_FMA             GGML_F32Cx4_FMA
#define GGML_F16_VEC_ADD             GGML_F32Cx4_ADD
#define GGML_F16_VEC_MUL             GGML_F32Cx4_MUL
#define GGML_F16_VEC_REDUCE          GGML_F32Cx4_REDUCE
|
||||
|
||||
#elif defined(__VXE__) || defined(__VXE2__)
|
||||
|
||||
#define GGML_SIMD

// F32 s390x
//
// Maps the generic GGML_F32_VEC_* interface onto `__vector float` registers.

#define GGML_F32_STEP 32
#define GGML_F32_EPR  4

#define GGML_F32x4              __vector float
#define GGML_F32x4_ZERO         vec_splats(0.0f)
#define GGML_F32x4_SET1         vec_splats
#define GGML_F32x4_LOAD(p)      vec_xl(0, p)
#define GGML_F32x4_STORE(p, r)  vec_xst(r, 0, p)
// GGML_F32x4_FMA(a, b, c) evaluates a + b*c.
#define GGML_F32x4_FMA(a, b, c) vec_madd(b, c, a)
#define GGML_F32x4_ADD          vec_add
#define GGML_F32x4_MUL          vec_mul

// Folds the GGML_F32_ARR accumulators x[0..ARR-1] pairwise into x[0] (x is
// overwritten), then adds the four elements of x[0] and assigns the sum to res.
// The halving loop runs until a single register is left, so it stays correct
// for any power-of-two GGML_F32_ARR rather than only for ARR <= 8.
#define GGML_F32x4_REDUCE(res, x)                                     \
{                                                                     \
    for (int offset = GGML_F32_ARR >> 1; offset > 0; offset >>= 1) {  \
        for (int i = 0; i < offset; ++i) {                            \
            x[i] = vec_add(x[i], x[offset + i]);                      \
        }                                                             \
    }                                                                 \
    res = vec_extract(x[0], 0) +                                      \
          vec_extract(x[0], 1) +                                      \
          vec_extract(x[0], 2) +                                      \
          vec_extract(x[0], 3);                                       \
}

#define GGML_F32_VEC        GGML_F32x4
#define GGML_F32_VEC_ZERO   GGML_F32x4_ZERO
#define GGML_F32_VEC_SET1   GGML_F32x4_SET1
#define GGML_F32_VEC_LOAD   GGML_F32x4_LOAD
#define GGML_F32_VEC_STORE  GGML_F32x4_STORE
#define GGML_F32_VEC_FMA    GGML_F32x4_FMA
#define GGML_F32_VEC_ADD    GGML_F32x4_ADD
#define GGML_F32_VEC_MUL    GGML_F32x4_MUL
#define GGML_F32_VEC_REDUCE GGML_F32x4_REDUCE
|
||||
|
||||
// F16 s390x
|
||||
// The F16 path reuses the F32 loop geometry defined for this branch.
#define GGML_F16_STEP GGML_F32_STEP
#define GGML_F16_EPR  GGML_F32_EPR
|
||||
|
||||
// Converts the four fp16 values x[0..3] to fp32 through a stack buffer and
// returns the buffer loaded as one vector.
static inline __vector float __lzs_f16cx4_load(const ggml_fp16_t * x) {
    float widened[4];

    widened[0] = GGML_FP16_TO_FP32(x[0]);
    widened[1] = GGML_FP16_TO_FP32(x[1]);
    widened[2] = GGML_FP16_TO_FP32(x[2]);
    widened[3] = GGML_FP16_TO_FP32(x[3]);

    return vec_xl(0, widened);
}
|
||||
|
||||
// Spills the vector y to a stack buffer of four floats and writes each one,
// converted to fp16, to x[0..3].
static inline void __lzs_f16cx4_store(ggml_fp16_t * x, __vector float y) {
    float spilled[4];

    vec_xst(y, 0, spilled);

    x[0] = GGML_FP32_TO_FP16(spilled[0]);
    x[1] = GGML_FP32_TO_FP16(spilled[1]);
    x[2] = GGML_FP32_TO_FP16(spilled[2]);
    x[3] = GGML_FP32_TO_FP16(spilled[3]);
}
|
||||
|
||||
// Generic F16 vector interface: registers hold fp32 lanes, so everything but
// LOAD/STORE aliases the F32x4 mapping. The index argument `i` is ignored by
// LOAD; STORE writes register r[i].
#define GGML_F16_VEC                GGML_F32x4
#define GGML_F16_VEC_ZERO           GGML_F32x4_ZERO
#define GGML_F16_VEC_SET1           GGML_F32x4_SET1
#define GGML_F16_VEC_LOAD(p, i)     __lzs_f16cx4_load(p)
#define GGML_F16_VEC_STORE(p, r, i) __lzs_f16cx4_store(p, r[i])
#define GGML_F16_VEC_FMA            GGML_F32x4_FMA
#define GGML_F16_VEC_ADD            GGML_F32x4_ADD
#define GGML_F16_VEC_MUL            GGML_F32x4_MUL
#define GGML_F16_VEC_REDUCE         GGML_F32x4_REDUCE
|
||||
|
||||
#endif
|
||||
|
||||
// GGML_F32_ARR / GGML_F16_ARR
// Number of registers used per step: the step size divided by the per-register
// element count. Only defined when one of the SIMD branches set GGML_SIMD.
#if defined(GGML_SIMD)
#define GGML_F32_ARR (GGML_F32_STEP/GGML_F32_EPR)
#define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
#endif
|
@ -1,186 +0,0 @@
|
||||
#include "unary-ops.h"
|
||||
|
||||
// Absolute value of x.
static inline float op_abs(float x) {
    const float magnitude = fabsf(x);
    return magnitude;
}
|
||||
|
||||
// Sign of x: 1 for positive, -1 for negative, 0 otherwise
// (zero, and NaN because both comparisons are false).
static inline float op_sgn(float x) {
    if (x > 0.f) {
        return 1.f;
    }
    if (x < 0.f) {
        return -1.f;
    }
    return 0.f;
}
|
||||
|
||||
// Arithmetic negation of x.
static inline float op_neg(float x) {
    const float negated = -x;
    return negated;
}
|
||||
|
||||
// Unit step: 1 for strictly positive x, 0 for everything else.
static inline float op_step(float x) {
    if (x > 0.f) {
        return 1.f;
    }
    return 0.f;
}
|
||||
|
||||
// Hyperbolic tangent of x (single precision).
static inline float op_tanh(float x) {
    const float result = tanhf(x);
    return result;
}
|
||||
|
||||
// ELU: identity for strictly positive x, exp(x) - 1 otherwise.
static inline float op_elu(float x) {
    if (x > 0.f) {
        return x;
    }
    return expm1f(x);
}
|
||||
|
||||
// ReLU: x when strictly positive, 0 otherwise.
static inline float op_relu(float x) {
    if (x > 0.f) {
        return x;
    }
    return 0.f;
}
|
||||
|
||||
// Logistic sigmoid: 1 / (1 + exp(-x)).
static inline float op_sigmoid(float x) {
    const float decay = expf(-x);
    return 1.f / (1.f + decay);
}
|
||||
|
||||
// Hard sigmoid: (x + 3) / 6 clamped to the range [0, 1].
static inline float op_hardsigmoid(float x) {
    const float ramp    = (x + 3.0f) / 6.0f;
    const float floored = fmaxf(0.0f, ramp);
    return fminf(1.0f, floored);
}
|
||||
|
||||
// Natural exponential of x (single precision).
static inline float op_exp(float x) {
    const float result = expf(x);
    return result;
}
|
||||
|
||||
// Hard swish: x scaled by (x + 3) / 6 clamped to the range [0, 1].
static inline float op_hardswish(float x) {
    const float ramp = (x + 3.0f) / 6.0f;
    const float gate = fminf(1.0f, fmaxf(0.0f, ramp));
    return x * gate;
}
|
||||
|
||||
// Square of x.
static inline float op_sqr(float x) {
    const float squared = x * x;
    return squared;
}
|
||||
|
||||
// Square root of x (single precision).
static inline float op_sqrt(float x) {
    const float root = sqrtf(x);
    return root;
}
|
||||
|
||||
// Sine of x (single precision).
static inline float op_sin(float x) {
    const float result = sinf(x);
    return result;
}
|
||||
|
||||
// Cosine of x (single precision).
static inline float op_cos(float x) {
    const float result = cosf(x);
    return result;
}
|
||||
|
||||
// Natural logarithm of x (single precision).
static inline float op_log(float x) {
    const float result = logf(x);
    return result;
}
|
||||
|
||||
template <float (*op)(float), typename src0_t, typename dst_t>
|
||||
static inline void vec_unary_op(int64_t n, dst_t * y, const src0_t * x) {
|
||||
constexpr auto src0_to_f32 = type_conversion_table<src0_t>::to_f32;
|
||||
constexpr auto f32_to_dst = type_conversion_table<dst_t >::from_f32;
|
||||
|
||||
for (int i = 0; i < n; i++) {
|
||||
y[i] = f32_to_dst(op(src0_to_f32(x[i])));
|
||||
}
|
||||
}
|
||||
|
||||
template <float (*op)(float), typename src0_t, typename dst_t>
|
||||
static void apply_unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
GGML_ASSERT(ggml_is_contiguous_1(src0) && ggml_is_contiguous_1(dst) && ggml_are_same_shape(src0, dst));
|
||||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS
|
||||
|
||||
GGML_ASSERT( nb0 == sizeof(dst_t));
|
||||
GGML_ASSERT(nb00 == sizeof(src0_t));
|
||||
|
||||
const auto [ir0, ir1] = get_thread_range(params, src0);
|
||||
|
||||
for (int64_t ir = ir0; ir < ir1; ++ir) {
|
||||
const int64_t i03 = ir/(ne02*ne01);
|
||||
const int64_t i02 = (ir - i03*ne02*ne01)/ne01;
|
||||
const int64_t i01 = (ir - i03*ne02*ne01 - i02*ne01);
|
||||
|
||||
dst_t * dst_ptr = (dst_t *) ((char *) dst->data + i03*nb3 + i02*nb2 + i01*nb1 );
|
||||
const src0_t * src0_ptr = (const src0_t *) ((const char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01);
|
||||
|
||||
vec_unary_op<op>(ne0, dst_ptr, src0_ptr);
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Use the 'traits' lookup table (for type conversion fns), instead of a mass of 'if' conditions with long templates
|
||||
template <float (*op)(float)>
|
||||
static void unary_op(const ggml_compute_params * params, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
|
||||
/* */ if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { // all f32
|
||||
apply_unary_op<op, float, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { // all f16
|
||||
apply_unary_op<op, ggml_fp16_t, ggml_fp16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_BF16) { // all bf16
|
||||
apply_unary_op<op, ggml_bf16_t, ggml_bf16_t>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_BF16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op<op, ggml_bf16_t, float>(params, dst);
|
||||
} else if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F32) {
|
||||
apply_unary_op<op, ggml_fp16_t, float>(params, dst);
|
||||
} else {
|
||||
fprintf(stderr, "%s: unsupported types: dst: %s, src0: %s\n", __func__,
|
||||
ggml_type_name(dst->type), ggml_type_name(src0->type));
|
||||
GGML_ABORT("fatal error");
|
||||
}
|
||||
}
|
||||
|
||||
// Forward pass for ABS: applies op_abs element-wise via unary_op.
void ggml_compute_forward_abs(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_abs>(params, dst);
}
|
||||
|
||||
// Forward pass for SGN: applies op_sgn element-wise via unary_op.
void ggml_compute_forward_sgn(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_sgn>(params, dst);
}
|
||||
|
||||
// Forward pass for NEG: applies op_neg element-wise via unary_op.
void ggml_compute_forward_neg(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_neg>(params, dst);
}
|
||||
|
||||
// Forward pass for STEP: applies op_step element-wise via unary_op.
void ggml_compute_forward_step(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_step>(params, dst);
}
|
||||
|
||||
// Forward pass for TANH: applies op_tanh element-wise via unary_op.
void ggml_compute_forward_tanh(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_tanh>(params, dst);
}
|
||||
|
||||
// Forward pass for ELU: applies op_elu element-wise via unary_op.
void ggml_compute_forward_elu(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_elu>(params, dst);
}
|
||||
|
||||
// Forward pass for RELU: applies op_relu element-wise via unary_op.
void ggml_compute_forward_relu(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_relu>(params, dst);
}
|
||||
|
||||
// Forward pass for SIGMOID: applies op_sigmoid element-wise via unary_op.
void ggml_compute_forward_sigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_sigmoid>(params, dst);
}
|
||||
|
||||
// Forward pass for HARDSIGMOID: applies op_hardsigmoid element-wise via unary_op.
void ggml_compute_forward_hardsigmoid(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_hardsigmoid>(params, dst);
}
|
||||
|
||||
// Forward pass for EXP: applies op_exp element-wise via unary_op.
void ggml_compute_forward_exp(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_exp>(params, dst);
}
|
||||
|
||||
// Forward pass for HARDSWISH: applies op_hardswish element-wise via unary_op.
void ggml_compute_forward_hardswish(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_hardswish>(params, dst);
}
|
||||
|
||||
// Forward pass for SQR: applies op_sqr element-wise via unary_op.
void ggml_compute_forward_sqr(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_sqr>(params, dst);
}
|
||||
|
||||
// Forward pass for SQRT: applies op_sqrt element-wise via unary_op.
void ggml_compute_forward_sqrt(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_sqrt>(params, dst);
}
|
||||
|
||||
// Forward pass for SIN: applies op_sin element-wise via unary_op.
void ggml_compute_forward_sin(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_sin>(params, dst);
}
|
||||
|
||||
// Forward pass for COS: applies op_cos element-wise via unary_op.
void ggml_compute_forward_cos(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_cos>(params, dst);
}
|
||||
|
||||
// Forward pass for LOG: applies op_log element-wise via unary_op.
void ggml_compute_forward_log(const ggml_compute_params * params, ggml_tensor * dst) {
    return unary_op<op_log>(params, dst);
}
|
@ -1,28 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if defined(__cplusplus)
extern "C" {
#endif

// Element-wise unary forward kernels. Each one takes the compute parameters
// and the destination tensor; the declarations get C linkage when this header
// is included from C++.
void ggml_compute_forward_abs        (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sgn        (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_neg        (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_step       (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_tanh       (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_elu        (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_relu       (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sigmoid    (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_hardsigmoid(const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_exp        (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_hardswish  (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sqr        (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sqrt       (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_sin        (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_cos        (const struct ggml_compute_params * params, struct ggml_tensor * dst);
void ggml_compute_forward_log        (const struct ggml_compute_params * params, struct ggml_tensor * dst);

#if defined(__cplusplus)
}
#endif
|
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user