mirror of
https://github.com/ggerganov/whisper.cpp.git
synced 2025-07-03 16:01:03 +02:00
Compare commits
1 Commits
java-bindi
...
fix-coreml
Author | SHA1 | Date | |
---|---|---|---|
8cbc363561 |
217
.github/workflows/build.yml
vendored
217
.github/workflows/build.yml
vendored
@ -1,41 +1,31 @@
|
||||
name: CI
|
||||
on: [push, pull_request]
|
||||
|
||||
env:
|
||||
ubuntu_image: "ubuntu:22.04"
|
||||
|
||||
jobs:
|
||||
ubuntu-latest:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
- name: Dependencies
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
apt update
|
||||
apt install -y build-essential libsdl2-dev
|
||||
make
|
||||
make stream'
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
make stream
|
||||
|
||||
macOS-latest:
|
||||
runs-on: macOS-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Dependencies
|
||||
run: |
|
||||
@ -47,104 +37,82 @@ jobs:
|
||||
make
|
||||
make stream
|
||||
|
||||
freeBSD-latest:
|
||||
runs-on: macos-12
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Build
|
||||
uses: cross-platform-actions/action@v0.15.0
|
||||
with:
|
||||
operating_system: freebsd
|
||||
version: '13.2'
|
||||
run: |
|
||||
sudo pkg update
|
||||
sudo pkg install -y gmake sdl2
|
||||
gmake
|
||||
gmake stream
|
||||
|
||||
ubuntu-latest-gcc:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
- name: Dependencies
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
apt update
|
||||
apt install -y build-essential cmake libsdl2-dev
|
||||
cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install cmake
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Configure
|
||||
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
ctest -L gh --output-on-failure
|
||||
|
||||
ubuntu-latest-clang:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
build: [Debug, Release]
|
||||
arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
- name: Dependencies
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
apt update
|
||||
apt install -y build-essential cmake libsdl2-dev
|
||||
cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install cmake
|
||||
sudo apt-get install libsdl2-dev
|
||||
|
||||
- name: Configure
|
||||
run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
ctest -L gh --output-on-failure
|
||||
|
||||
ubuntu-latest-gcc-sanitized:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
strategy:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
sanitizer: [ADDRESS, THREAD, UNDEFINED]
|
||||
arch: [linux/amd64]
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v2
|
||||
|
||||
- name: Build ${{ matrix.arch }}
|
||||
- name: Dependencies
|
||||
run: |
|
||||
docker run --platform ${{ matrix.arch }} --rm \
|
||||
-v ${{ github.workspace }}:/workspace \
|
||||
-w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
|
||||
apt update
|
||||
apt install -y build-essential cmake
|
||||
cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
make
|
||||
ctest -L gh --output-on-failure'
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential
|
||||
sudo apt-get install cmake
|
||||
|
||||
- name: Configure
|
||||
run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
make
|
||||
ctest -L gh --output-on-failure
|
||||
|
||||
windows:
|
||||
runs-on: windows-latest
|
||||
@ -166,7 +134,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v1
|
||||
@ -227,7 +195,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v1
|
||||
@ -275,10 +243,10 @@ jobs:
|
||||
with:
|
||||
name: whisper-blas-bin-${{ matrix.arch }}
|
||||
path: build/bin/${{ matrix.build }}
|
||||
|
||||
|
||||
windows-cublas:
|
||||
runs-on: windows-latest
|
||||
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
build: [Release]
|
||||
@ -290,40 +258,40 @@ jobs:
|
||||
s2arc: x64
|
||||
- sdl2: ON
|
||||
s2ver: 2.26.0
|
||||
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Add msbuild to PATH
|
||||
uses: microsoft/setup-msbuild@v1
|
||||
|
||||
|
||||
- name: Install CUDA Toolkit
|
||||
id: cuda-toolkit
|
||||
uses: Jimver/cuda-toolkit@v0.2.10
|
||||
|
||||
|
||||
- name: Fetch SDL2 and set SDL2_DIR
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: |
|
||||
C:/msys64/usr/bin/wget.exe -qO sdl2.zip https://github.com/libsdl-org/SDL/releases/download/release-${{ matrix.s2ver }}/SDL2-devel-${{ matrix.s2ver }}-VC.zip
|
||||
7z x sdl2.zip
|
||||
echo "SDL2_DIR=$env:GITHUB_WORKSPACE/SDL2-${{ matrix.s2ver }}/cmake" >> $env:GITHUB_ENV
|
||||
|
||||
|
||||
- name: Configure
|
||||
run: >
|
||||
cmake -S . -B ./build -A ${{ matrix.arch }}
|
||||
-DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
-DWHISPER_CUBLAS=1
|
||||
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
cd ./build
|
||||
msbuild ALL_BUILD.vcxproj -t:build -p:configuration=${{ matrix.build }} -p:platform=${{ matrix.arch }}
|
||||
|
||||
|
||||
- name: Copy SDL2.dll
|
||||
if: matrix.sdl2 == 'ON'
|
||||
run: copy "$env:SDL2_DIR/../lib/${{ matrix.s2arc }}/SDL2.dll" build/bin/${{ matrix.build }}
|
||||
|
||||
|
||||
- name: Upload binaries
|
||||
if: matrix.sdl2 == 'ON'
|
||||
uses: actions/upload-artifact@v1
|
||||
@ -340,16 +308,24 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Setup emsdk
|
||||
uses: mymindstorm/setup-emsdk@v12
|
||||
- name: Dependencies
|
||||
run: |
|
||||
wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
|
||||
tar -xvf master.tar.gz
|
||||
emsdk-master/emsdk update
|
||||
emsdk-master/emsdk install latest
|
||||
emsdk-master/emsdk activate latest
|
||||
|
||||
- name: Verify
|
||||
run: emcc -v
|
||||
- name: Configure
|
||||
run: echo "tmp"
|
||||
|
||||
- name: Build
|
||||
run: |
|
||||
pushd emsdk-master
|
||||
source ./emsdk_env.sh
|
||||
popd
|
||||
emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
|
||||
make
|
||||
|
||||
@ -362,7 +338,7 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Configure
|
||||
run: |
|
||||
@ -371,7 +347,7 @@ jobs:
|
||||
|
||||
- name: Build objc example
|
||||
run: xcodebuild -project examples/whisper.objc/whisper.objc.xcodeproj -scheme whisper.objc -configuration ${{ matrix.build }} -sdk iphonesimulator build
|
||||
|
||||
|
||||
- name: Build swiftui example
|
||||
run: xcodebuild -project examples/whisper.swiftui/whisper.swiftui.xcodeproj -scheme WhisperCppDemo -configuration ${{ matrix.build }} -sdk iphonesimulator build
|
||||
|
||||
@ -380,14 +356,14 @@ jobs:
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
uses: actions/checkout@v1
|
||||
|
||||
- name: Install Java
|
||||
uses: actions/setup-java@v3
|
||||
with:
|
||||
distribution: zulu
|
||||
java-version: 17
|
||||
|
||||
|
||||
- name: Setup Android SDK
|
||||
uses: android-actions/setup-android@v2
|
||||
|
||||
@ -400,7 +376,7 @@ jobs:
|
||||
needs: [ 'windows' ]
|
||||
runs-on: windows-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- uses: actions/checkout@v1
|
||||
|
||||
- name: Install Java
|
||||
uses: actions/setup-java@v1
|
||||
@ -426,24 +402,11 @@ jobs:
|
||||
name: whispercpp.jar
|
||||
path: bindings/java/build/libs/whispercpp-*.jar
|
||||
|
||||
- name: Publish package
|
||||
if: ${{ github.ref == 'refs/heads/master' }}
|
||||
uses: gradle/gradle-build-action@v2
|
||||
with:
|
||||
arguments: publish
|
||||
env:
|
||||
MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
|
||||
MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
|
||||
|
||||
quantize:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v3
|
||||
|
||||
- name: Test quantize
|
||||
run: |
|
||||
./models/download-ggml-model.sh tiny.en
|
||||
make quantize
|
||||
./quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0
|
||||
# - name: Publish package
|
||||
# if: ${{ github.ref == 'refs/heads/master' }}
|
||||
# uses: gradle/gradle-build-action@v2
|
||||
# with:
|
||||
# arguments: publish
|
||||
# env:
|
||||
# MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
|
||||
# MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
|
||||
|
1
.gitignore
vendored
1
.gitignore
vendored
@ -24,7 +24,6 @@ build-sanitize-thread/
|
||||
/talk-llama
|
||||
/bench
|
||||
/quantize
|
||||
/lsp
|
||||
|
||||
arm_neon.h
|
||||
sync.sh
|
||||
|
@ -65,7 +65,6 @@ else()
|
||||
option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
|
||||
option(WHISPER_OPENBLAS "whisper: prefer OpenBLAS" OFF)
|
||||
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
|
||||
option(WHISPER_HIPBLAS "whisper: support for hipBLAS" OFF)
|
||||
option(WHISPER_CLBLAST "whisper: use CLBlast" OFF)
|
||||
endif()
|
||||
|
||||
@ -137,34 +136,22 @@ if (WHISPER_OPENBLAS)
|
||||
endif()
|
||||
|
||||
if (WHISPER_BLAS)
|
||||
if (WIN32)
|
||||
if(DEFINED ENV{OPENBLAS_PATH})
|
||||
set(BLAS_LIBRARIES $ENV{OPENBLAS_PATH}/lib/libopenblas.dll.a)
|
||||
message(STATUS "Libraries ${BLAS_LIBRARIES}")
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
|
||||
include_directories($ENV{OPENBLAS_PATH}/include)
|
||||
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||
else ()
|
||||
message(WARNING "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
|
||||
endif ()
|
||||
else ()
|
||||
set(BLA_STATIC 1)
|
||||
set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
|
||||
# set(BLA_PREFER_PKGCONFIG 1)
|
||||
set(BLA_SIZEOF_INTEGER 8)
|
||||
find_package(BLAS)
|
||||
set(BLA_STATIC 1)
|
||||
set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
|
||||
# set(BLA_PREFER_PKGCONFIG 1)
|
||||
set(BLA_SIZEOF_INTEGER 8)
|
||||
find_package(BLAS)
|
||||
|
||||
if(BLAS_FOUND)
|
||||
message(STATUS "BLAS compatible library found")
|
||||
message(STATUS "Libraries ${BLAS_LIBRARIES}")
|
||||
find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas /usr/local/include/openblas $ENV{BLAS_HOME}/include)
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
|
||||
include_directories(${BLAS_INCLUDE_DIRS})
|
||||
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||
else()
|
||||
message(WARNING "BLAS library was not found")
|
||||
endif()
|
||||
endif ()
|
||||
if(BLAS_FOUND)
|
||||
message(STATUS "BLAS compatible library found")
|
||||
message(STATUS "Libraries ${BLAS_LIBRARIES}")
|
||||
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
|
||||
|
||||
include_directories(${BLAS_INCLUDE_DIRS})
|
||||
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
|
||||
else()
|
||||
message(WARNING "BLAS library was not found")
|
||||
endif()
|
||||
endif ()
|
||||
|
||||
if (WHISPER_CUBLAS)
|
||||
@ -192,37 +179,6 @@ if (WHISPER_CUBLAS)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
if (WHISPER_HIPBLAS)
|
||||
list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
|
||||
if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
|
||||
message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
|
||||
endif()
|
||||
if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
|
||||
message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
|
||||
endif()
|
||||
|
||||
find_package(hip)
|
||||
find_package(hipblas)
|
||||
find_package(rocblas)
|
||||
|
||||
if (${hipblas_FOUND} AND ${hip_FOUND})
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
|
||||
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
|
||||
set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
|
||||
set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
|
||||
target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
|
||||
|
||||
if (WHISPER_STATIC)
|
||||
message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
|
||||
endif()
|
||||
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ggml-rocm)
|
||||
else()
|
||||
message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (WHISPER_CLBLAST)
|
||||
find_package(CLBlast)
|
||||
if (CLBlast_FOUND)
|
||||
@ -281,25 +237,20 @@ message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
|
||||
if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
|
||||
message(STATUS "ARM detected")
|
||||
elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
|
||||
message(STATUS "PowerPC detected")
|
||||
else()
|
||||
message(STATUS "x86 detected")
|
||||
if (MSVC)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8")
|
||||
if(NOT WHISPER_NO_AVX2)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
|
||||
else()
|
||||
if(NOT WHISPER_NO_AVX)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
|
||||
if(NOT WHISPER_NO_AVX2)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2")
|
||||
else()
|
||||
if(NOT WHISPER_NO_AVX)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX")
|
||||
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
else()
|
||||
if (EMSCRIPTEN)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -pthread")
|
||||
|
125
Makefile
125
Makefile
@ -12,12 +12,6 @@ ifndef UNAME_M
|
||||
UNAME_M := $(shell uname -m)
|
||||
endif
|
||||
|
||||
ifndef NVCC_VERSION
|
||||
ifeq ($(call,$(shell which nvcc))$(.SHELLSTATUS),0)
|
||||
NVCC_VERSION := $(shell nvcc --version | egrep -o "V[0-9]+.[0-9]+.[0-9]+" | cut -c2-)
|
||||
endif
|
||||
endif
|
||||
|
||||
CCV := $(shell $(CC) --version | head -n 1)
|
||||
CXXV := $(shell $(CXX) --version | head -n 1)
|
||||
|
||||
@ -57,7 +51,19 @@ endif
|
||||
|
||||
# OS specific
|
||||
# TODO: support Windows
|
||||
ifeq ($(filter $(UNAME_S),Linux Darwin DragonFly FreeBSD NetBSD OpenBSD Haiku),$(UNAME_S))
|
||||
ifeq ($(UNAME_S),Linux)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),FreeBSD)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
ifeq ($(UNAME_S),Haiku)
|
||||
CFLAGS += -pthread
|
||||
CXXFLAGS += -pthread
|
||||
endif
|
||||
@ -67,50 +73,60 @@ endif
|
||||
# feel free to update the Makefile for your architecture and send a pull request or issue
|
||||
ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
CPUINFO_CMD := sysctl machdep.cpu.features
|
||||
else ifeq ($(UNAME_S),Linux)
|
||||
CPUINFO_CMD := cat /proc/cpuinfo
|
||||
else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
|
||||
CPUINFO_CMD := cat /proc/cpuinfo
|
||||
else ifeq ($(UNAME_S),Haiku)
|
||||
CPUINFO_CMD := sysinfo -cpu
|
||||
endif
|
||||
|
||||
ifdef CPUINFO_CMD
|
||||
AVX_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx ")
|
||||
ifneq (,$(findstring avx,$(AVX_M)))
|
||||
CFLAGS += -mf16c
|
||||
AVX1_M := $(shell sysctl machdep.cpu.features)
|
||||
ifneq (,$(findstring FMA,$(AVX1_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
ifneq (,$(findstring AVX1.0,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
|
||||
AVX2_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx2 ")
|
||||
AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
|
||||
ifneq (,$(findstring AVX2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
else ifeq ($(UNAME_S),Linux)
|
||||
AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
|
||||
FMA_M := $(shell $(CPUINFO_CMD) | grep -m 1 "fma ")
|
||||
FMA_M := $(shell grep "fma " /proc/cpuinfo)
|
||||
ifneq (,$(findstring fma,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
|
||||
F16C_M := $(shell $(CPUINFO_CMD) | grep -m 1 "f16c ")
|
||||
F16C_M := $(shell grep "f16c " /proc/cpuinfo)
|
||||
ifneq (,$(findstring f16c,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
|
||||
AVX1_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx ")
|
||||
AVX1_M := $(shell grep "avx " /proc/cpuinfo)
|
||||
ifneq (,$(findstring avx,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
endif
|
||||
|
||||
SSE3_M := $(shell $(CPUINFO_CMD) | grep -m 1 "sse3 ")
|
||||
SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
|
||||
ifneq (,$(findstring sse3,$(SSE3_M)))
|
||||
CFLAGS += -msse3
|
||||
endif
|
||||
|
||||
SSSE3_M := $(shell $(CPUINFO_CMD) | grep -m 1 "ssse3 ")
|
||||
ifneq (,$(findstring ssse3,$(SSSE3_M)))
|
||||
CFLAGS += -mssse3
|
||||
else ifeq ($(UNAME_S),Haiku)
|
||||
AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
|
||||
ifneq (,$(findstring avx2,$(AVX2_M)))
|
||||
CFLAGS += -mavx2
|
||||
endif
|
||||
FMA_M := $(shell sysinfo -cpu | grep "FMA ")
|
||||
ifneq (,$(findstring fma,$(FMA_M)))
|
||||
CFLAGS += -mfma
|
||||
endif
|
||||
F16C_M := $(shell sysinfo -cpu | grep "F16C ")
|
||||
ifneq (,$(findstring f16c,$(F16C_M)))
|
||||
CFLAGS += -mf16c
|
||||
|
||||
AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
|
||||
ifneq (,$(findstring avx,$(AVX1_M)))
|
||||
CFLAGS += -mavx
|
||||
endif
|
||||
endif
|
||||
else
|
||||
CFLAGS += -mfma -mf16c -mavx -mavx2
|
||||
endif
|
||||
endif
|
||||
ifeq ($(UNAME_M),amd64)
|
||||
@ -146,56 +162,29 @@ endif
|
||||
endif
|
||||
|
||||
ifdef WHISPER_OPENBLAS
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
|
||||
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
|
||||
LDFLAGS += -lopenblas
|
||||
endif
|
||||
|
||||
ifdef WHISPER_CUBLAS
|
||||
ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
|
||||
CUDA_ARCH_FLAG=native
|
||||
else
|
||||
CUDA_ARCH_FLAG=all
|
||||
endif
|
||||
|
||||
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
||||
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
|
||||
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
|
||||
WHISPER_OBJ += ggml-cuda.o
|
||||
NVCC = nvcc
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
|
||||
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=any
|
||||
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
|
||||
endif
|
||||
|
||||
ifdef WHISPER_HIPBLAS
|
||||
ROCM_PATH ?= /opt/rocm
|
||||
HIPCC ?= $(ROCM_PATH)/bin/hipcc
|
||||
GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
|
||||
CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||
CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
|
||||
LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
|
||||
LDFLAGS += -lhipblas -lamdhip64 -lrocblas
|
||||
HIPFLAGS += $(addprefix --offload-arch=,$(GPU_TARGETS))
|
||||
WHISPER_OBJ += ggml-cuda.o
|
||||
|
||||
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
|
||||
$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
|
||||
endif
|
||||
|
||||
ifdef WHISPER_CLBLAST
|
||||
CFLAGS += -DGGML_USE_CLBLAST
|
||||
CXXFLAGS += -DGGML_USE_CLBLAST
|
||||
LDFLAGS += -lclblast
|
||||
ifeq ($(UNAME_S),Darwin)
|
||||
LDFLAGS += -framework OpenCL
|
||||
else
|
||||
LDFLAGS += -lOpenCL
|
||||
endif
|
||||
LDFLAGS += -lclblast -lOpenCL
|
||||
WHISPER_OBJ += ggml-opencl.o
|
||||
|
||||
ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
|
||||
$(CXX) $(CXXFLAGS) -c $< -o $@
|
||||
$(CC) $(CFLAGS) -c $< -o $@
|
||||
endif
|
||||
|
||||
ifdef WHISPER_GPROF
|
||||
@ -273,7 +262,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
|
||||
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
|
||||
|
||||
clean:
|
||||
rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so
|
||||
rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
|
||||
|
||||
#
|
||||
# Examples
|
||||
@ -300,9 +289,6 @@ stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHIS
|
||||
command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
||||
$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)
|
||||
|
||||
lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
||||
$(CXX) $(CXXFLAGS) examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o lsp $(CC_SDL) $(LDFLAGS)
|
||||
|
||||
talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
|
||||
$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
|
||||
|
||||
@ -323,7 +309,6 @@ samples:
|
||||
@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
|
||||
@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
|
||||
@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
|
||||
@wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
|
||||
@echo "Converting to 16-bit WAV ..."
|
||||
@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
|
||||
@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
|
||||
@ -333,8 +318,6 @@ samples:
|
||||
@rm samples/mm1.wav
|
||||
@ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
|
||||
@rm samples/a13.mp3
|
||||
@ffmpeg -loglevel -0 -y -i samples/diffusion2023-07-03.flac -ar 16000 -ac 1 -c:a pcm_s16le samples/diffusion2023-07-03.wav
|
||||
@rm samples/diffusion2023-07-03.flac
|
||||
|
||||
#
|
||||
# Models
|
||||
@ -376,4 +359,4 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
|
||||
|
||||
.PHONY: tests
|
||||
tests:
|
||||
bash ./tests/run-tests.sh $(word 2, $(MAKECMDGOALS))
|
||||
bash ./tests/run-tests.sh
|
||||
|
82
README.md
82
README.md
@ -22,7 +22,6 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
|
||||
- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
|
||||
- [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
|
||||
- [BLAS CPU support via OpenBLAS](https://github.com/ggerganov/whisper.cpp#blas-cpu-support-via-openblas)
|
||||
- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
|
||||
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
|
||||
|
||||
Supported platforms:
|
||||
@ -61,7 +60,7 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
|
||||
- Various other examples are available in the [examples](examples) folder
|
||||
|
||||
The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
|
||||
intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
|
||||
instrisics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
|
||||
the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.
|
||||
|
||||
## Quick start
|
||||
@ -312,85 +311,6 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
|
||||
|
||||
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
|
||||
|
||||
## OpenVINO support
|
||||
|
||||
On platforms that support [OpenVINO](https://github.com/openvinotoolkit/openvino), the Encoder inference can be executed
|
||||
on OpenVINO-supported devices including x86 CPUs and Intel GPUs (integrated & discrete).
|
||||
|
||||
This can result in significant speedup in encoder performance. Here are the instructions for generating the OpenVINO model and using it with `whisper.cpp`:
|
||||
|
||||
- First, setup python virtual env. and install python dependencies. Python 3.10 is recommended.
|
||||
|
||||
Windows:
|
||||
```
|
||||
cd models
|
||||
python -m venv openvino_conv_env
|
||||
openvino_conv_env\Scripts\activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r openvino-conversion-requirements.txt
|
||||
```
|
||||
|
||||
Linux and macOS:
|
||||
```
|
||||
cd models
|
||||
python3 -m venv openvino_conv_env
|
||||
source openvino_conv_env/bin/activate
|
||||
python -m pip install --upgrade pip
|
||||
pip install -r openvino-conversion-requirements.txt
|
||||
```
|
||||
|
||||
- Generate an OpenVINO encoder model. For example, to generate a `base.en` model, use:
|
||||
|
||||
```
|
||||
python convert-whisper-to-openvino.py --model base.en
|
||||
```
|
||||
|
||||
This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as ggml models, as that
|
||||
is the default location that the OpenVINO extension will search at runtime.
|
||||
|
||||
- Build `whisper.cpp` with OpenVINO support:
|
||||
|
||||
Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
|
||||
|
||||
After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:
|
||||
|
||||
Linux:
|
||||
```bash
|
||||
source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
|
||||
```
|
||||
|
||||
Windows (cmd):
|
||||
```
|
||||
C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
|
||||
```
|
||||
|
||||
And then build the project using cmake:
|
||||
```bash
|
||||
cd build
|
||||
cmake -DWHISPER_OPENVINO=1 ..
|
||||
```
|
||||
|
||||
- Run the examples as usual. For example:
|
||||
```bash
|
||||
./main -m models/ggml-base.en.bin -f samples/jfk.wav
|
||||
|
||||
...
|
||||
|
||||
whisper_ctx_init_openvino_encoder: loading OpenVINO model from 'models/ggml-base.en-encoder-openvino.xml'
|
||||
whisper_ctx_init_openvino_encoder: first run on a device may take a while ...
|
||||
whisper_openvino_init: path_model = models/ggml-base.en-encoder-openvino.xml, device = GPU, cache_dir = models/ggml-base.en-encoder-openvino-cache
|
||||
whisper_ctx_init_openvino_encoder: OpenVINO model loaded
|
||||
|
||||
system_info: n_threads = 4 / 8 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | COREML = 0 | OPENVINO = 1 |
|
||||
|
||||
...
|
||||
```
|
||||
|
||||
The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
|
||||
cached for the next run.
|
||||
|
||||
For more information about the Core ML implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).
|
||||
|
||||
## NVIDIA GPU support via cuBLAS
|
||||
|
||||
With NVIDIA cards the Encoder processing can to a large extent be offloaded to the GPU through cuBLAS.
|
||||
|
@ -19,10 +19,6 @@ func (p *Params) SetTranslate(v bool) {
|
||||
p.translate = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetSplitOnWord(v bool) {
|
||||
p.split_on_word = toBool(v)
|
||||
}
|
||||
|
||||
func (p *Params) SetNoContext(v bool) {
|
||||
p.no_context = toBool(v)
|
||||
}
|
||||
|
@ -81,10 +81,6 @@ func (context *context) SetSpeedup(v bool) {
|
||||
context.params.SetSpeedup(v)
|
||||
}
|
||||
|
||||
func (context *context) SetSplitOnWord(v bool) {
|
||||
context.params.SetSplitOnWord(v)
|
||||
}
|
||||
|
||||
// Set number of threads to use
|
||||
func (context *context) SetThreads(v uint) {
|
||||
context.params.SetThreads(int(v))
|
||||
|
@ -42,7 +42,6 @@ type Context interface {
|
||||
SetDuration(time.Duration) // Set duration
|
||||
SetThreads(uint) // Set number of threads to use
|
||||
SetSpeedup(bool) // Set speedup flag
|
||||
SetSplitOnWord(bool) // Set split on word flag
|
||||
SetTokenThreshold(float32) // Set timestamp token probability threshold
|
||||
SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
|
||||
SetMaxSegmentLength(uint) // Set max segment length in characters
|
||||
|
@ -31,10 +31,10 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
|
||||
API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
|
||||
@interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>
|
||||
|
||||
/// var_1346 as multidimensional array of floats
|
||||
@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
|
||||
/// var_1195 as multidimensional array of floats
|
||||
@property (readwrite, nonatomic, strong) MLMultiArray * var_1195;
|
||||
- (instancetype)init NS_UNAVAILABLE;
|
||||
- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
|
||||
- (instancetype)initWithVar_1195:(MLMultiArray *)var_1195 NS_DESIGNATED_INITIALIZER;
|
||||
|
||||
@end
|
||||
|
||||
|
@ -39,21 +39,21 @@
|
||||
|
||||
@implementation whisper_decoder_implOutput
|
||||
|
||||
- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
|
||||
- (instancetype)initWithVar_1195:(MLMultiArray *)var_1195 {
|
||||
self = [super init];
|
||||
if (self) {
|
||||
_var_1346 = var_1346;
|
||||
_var_1195 = var_1195;
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
- (NSSet<NSString *> *)featureNames {
|
||||
return [NSSet setWithArray:@[@"var_1346"]];
|
||||
return [NSSet setWithArray:@[@"var_1195"]];
|
||||
}
|
||||
|
||||
- (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
|
||||
if ([featureName isEqualToString:@"var_1346"]) {
|
||||
return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
|
||||
if ([featureName isEqualToString:@"var_1195"]) {
|
||||
return [MLFeatureValue featureValueWithMultiArray:self.var_1195];
|
||||
}
|
||||
return nil;
|
||||
}
|
||||
@ -177,7 +177,7 @@
|
||||
- (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
||||
id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
|
||||
if (!outFeatures) { return nil; }
|
||||
return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
|
||||
return [[whisper_decoder_implOutput alloc] initWithVar_1195:(MLMultiArray *)[outFeatures featureValueForName:@"var_1195"].multiArrayValue];
|
||||
}
|
||||
|
||||
- (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
|
||||
@ -192,7 +192,7 @@
|
||||
NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
|
||||
for (NSInteger i = 0; i < outBatch.count; i++) {
|
||||
id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
|
||||
whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
|
||||
whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1195:(MLMultiArray *)[resultProvider featureValueForName:@"var_1195"].multiArrayValue];
|
||||
[results addObject:result];
|
||||
}
|
||||
return results;
|
||||
|
@ -53,11 +53,9 @@ void whisper_coreml_encode(
|
||||
error: nil
|
||||
];
|
||||
|
||||
@autoreleasepool {
|
||||
whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
|
||||
whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
|
||||
|
||||
memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
|
||||
}
|
||||
memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
|
||||
}
|
||||
|
||||
#if __cplusplus
|
||||
|
@ -69,5 +69,4 @@ else()
|
||||
add_subdirectory(quantize)
|
||||
add_subdirectory(talk)
|
||||
add_subdirectory(talk-llama)
|
||||
add_subdirectory(lsp)
|
||||
endif()
|
||||
|
@ -1,9 +0,0 @@
|
||||
if (WHISPER_SDL2)
|
||||
# stream
|
||||
set(TARGET lsp)
|
||||
add_executable(${TARGET} lsp.cpp)
|
||||
|
||||
include(DefaultTargetOptions)
|
||||
|
||||
target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
|
||||
endif ()
|
@ -1,104 +0,0 @@
|
||||
# Language Server
|
||||
|
||||
This example consists of a simple language server to expose both unguided
|
||||
and guided (command) transcriptions by sending json messages over stdout/stdin
|
||||
as well as a rather robust vim plugin that makes use of the language server.
|
||||
|
||||
## Vim plugin quick start
|
||||
|
||||
Compile the language server with
|
||||
|
||||
```bash
|
||||
make lsp
|
||||
```
|
||||
Install the plugin itself by copying or symlinking whisper.vim into ~/.vim/autoload/
|
||||
|
||||
In your vimrc, set the path of your whisper.cpp directory and optionally add some keybinds.
|
||||
|
||||
```vim
|
||||
let g:whisper_dir = "~/whisper.cpp"
|
||||
" Start listening for commands when Ctrl - g is pressed in normal mode
|
||||
nnoremap <C-G> call whisper#requestCommands()<CR>
|
||||
" Start unguided transcription when Ctrl - g is pressed in insert mode
|
||||
inoremap <C-G> <Cmd>call whisper#doTranscription()<CR>
|
||||
```
|
||||
|
||||
## Vim plugin usage
|
||||
|
||||
The vim plugin was designed to closely follow the mnemonics of vim
|
||||
|
||||
`s:spoken_dict` is used to translate keys to their spoken form.
|
||||
|
||||
|
||||
Keys corresponding to a string use that spoken value normally and when a motion is expected, but use the key itself when a character is expected.
|
||||
Keys corresponding to a dict, like `i`, can have manual difinitions given to each possible commandset.
|
||||
|
||||
0 is normal (insert), 1 is motion (inside), 2 is it's usage as a single key ([till] i), and 3 is it's usage in an area selection (s -> [around] sentence)
|
||||
|
||||
Some punctuation items, like `-` are explicitly given pronunciations to prevent them from being picked as punctuation instead of an actual command word.
|
||||
|
||||
Not all commands will tokenize to a single token and this can interfere with interpretation. "yank" as an example, takes multiple tokens and correspondingly, will give more accurate detection when only the first "ya" is used. While it could be changed to something else that is a single token (copy), value was placed on maintaining vim mnemonics.
|
||||
|
||||
Commands that would normally move the editor into insert mode (insert, append, open, change) will begin unguided transcription.
|
||||
Unguided transcription will end when a speech segment ends in exit.
|
||||
Presence of punctuation can be designated by whether or not you add a pause between the previous speech segment and exit.
|
||||
Exiting only occurs if exit is the last word, so "Take the first exit on your right" would not cause transcription to end.
|
||||
|
||||
After a command is evaluated, the plugin will continue listening for the next command.
|
||||
|
||||
While in command mode, "Exit" will end listening.
|
||||
|
||||
A best effort approach is taken to keep track of audio that is recorded while a previous chunk is still processing and immediately interpret it afterwards, but the current voice detection still needs a fairly sizable gap to determine when a command has been spoken.
|
||||
|
||||
Log information is sent to a special `whisper_log` buffer and can be accessed with
|
||||
```vim
|
||||
:e whisper_log
|
||||
```
|
||||
|
||||
## Vim plugin configuration
|
||||
|
||||
`g:whisper_dir`
|
||||
A full path to the whisper.cpp repo. It can be expanded in the definition like so:
|
||||
```vim
|
||||
let g:whisper_dir = expand("~/whisper.cpp/")
|
||||
```
|
||||
(The WHISPER_CPP_HOME environment variable is also checked for users of the existing whisper.nvim script)
|
||||
|
||||
`g:whisper_lsp_path`
|
||||
Can be used to manually set the path to the language server.
|
||||
If not defined, it will be inferred from the above whisper_dir
|
||||
|
||||
`g:whisper_model_path`
|
||||
A full path to the model to load. If not defined, it will default to ggml-base.en.bin
|
||||
|
||||
`g:whisper_user_commands`
|
||||
A dictionary of spoken commands that correspond to either strings or funcrefs.
|
||||
This can be used to create connections with other user plugins, for example
|
||||
```vim
|
||||
let g:whisper_user_commands = {"gen": "llama#doLlamaGen"}
|
||||
```
|
||||
will trigger the llama.cpp plugin to begin generation when "gen" is spoken
|
||||
|
||||
## Language server methods
|
||||
|
||||
`registerCommandset`
|
||||
`params` is a list of strings that should be checked for with this commandset. The server prepends a space to these strings before tokenizing.
|
||||
Responds with
|
||||
`result.index` an integer index for the commandset registered, which should be included when initiating a guided transcription to select this commandset.
|
||||
Will return an error if any of the commands in the commandset have duplicate tokenizations
|
||||
|
||||
`guided`
|
||||
`params.commandset_index` An index returned by a corresponding commandset registration. If not set, the most recently registered commandset is used.
|
||||
`params.timestamp` A positive unsigned integer which designates a point in time which audio should begin processing from. If left blank, the start point of audio processing will be the moment the message is recieved. This should be left blank unless you have a timestamp from a previous response.
|
||||
Responds with
|
||||
`result.command_index` The numerical index (starting from 0) of the detected command in the selected commandset
|
||||
`result.command_text` A string containing the command as provided in the commandset
|
||||
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
|
||||
|
||||
`unguided`
|
||||
`params.no_context` Sets the corresponding whisper `no_context` param. Defaults to true. Might provide more accurate results for consecutive unguided transcriptions if those after the first are set to false.
|
||||
`params.prompt` If provided, sets the initial prompt used during transcription.
|
||||
`params.timestamp` A positive unsigned integer which designates a point in time which audio should begin processing from. If left blank, the start point of audio processing will be the moment the message is recieved. This should be left blank unless you have a timestamp from a previous response.
|
||||
Responds with
|
||||
`result.transcription` A string containing the transcribed text. N.B. This will almost always start with a space due to how text is tokenized.
|
||||
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
|
24596
examples/lsp/json.hpp
24596
examples/lsp/json.hpp
File diff suppressed because it is too large
Load Diff
@ -1,458 +0,0 @@
|
||||
#include "common.h"
|
||||
#include "common-sdl.h"
|
||||
#include "whisper.h"
|
||||
#include "json.hpp"
|
||||
|
||||
#include <iostream>
|
||||
#include <cassert>
|
||||
#include <cstdio>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
#include <deque>
|
||||
#include <set>
|
||||
|
||||
using json = nlohmann::json;
|
||||
|
||||
// command-line parameters
|
||||
struct whisper_params {
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t prompt_ms = 5000;
|
||||
int32_t command_ms = 8000;
|
||||
int32_t capture_id = -1;
|
||||
int32_t max_tokens = 32;
|
||||
int32_t audio_ctx = 0;
|
||||
|
||||
float vad_thold = 0.6f;
|
||||
float freq_thold = 100.0f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool translate = false;
|
||||
bool print_special = false;
|
||||
bool print_energy = false;
|
||||
|
||||
std::string language = "en";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
};
|
||||
struct command {
|
||||
std::vector<whisper_token> tokens;
|
||||
std::string plaintext;
|
||||
};
|
||||
struct commandset {
|
||||
std::vector<struct command> commands;
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
// TODO: Store longest command?
|
||||
// Multi-token commands should have probabilities of subsequent logits
|
||||
// given that the prior logit is correct.
|
||||
// In this case, all commands must be iterated.
|
||||
// This however, is likely highly involved as different tokens
|
||||
// almost certainly have different spoken lengths
|
||||
// It would also have performance implications equivalent to a beam search
|
||||
};
|
||||
|
||||
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
|
||||
|
||||
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
for (int i = 1; i < argc; i++) {
|
||||
std::string arg = argv[i];
|
||||
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
|
||||
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
|
||||
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
|
||||
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
|
||||
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
|
||||
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
|
||||
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||
fprintf(stderr, "\n");
|
||||
fprintf(stderr, "options:\n");
|
||||
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
|
||||
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
|
||||
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
|
||||
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
|
||||
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
|
||||
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
|
||||
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
|
||||
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
|
||||
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
|
||||
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
|
||||
using namespace std::chrono;
|
||||
uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
|
||||
uint64_t start_time = time_now;
|
||||
if (jparams.contains("timestamp")) {
|
||||
start_time = jparams.at("timestamp");
|
||||
}
|
||||
if(time_now - start_time < 500) {
|
||||
//wait for a backlog of audio
|
||||
std::this_thread::sleep_for(milliseconds(500 - (time_now - start_time)));
|
||||
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
|
||||
} else if (time_now - start_time > 1000) {
|
||||
audio.get(time_now-start_time, pcmf32);
|
||||
size_t max_offset = pcmf32.size() - WHISPER_SAMPLE_RATE;
|
||||
for(size_t offset=0;offset < max_offset;offset+=WHISPER_SAMPLE_RATE/10) {
|
||||
std::vector<float> audio_chunk(&pcmf32[offset], &pcmf32[offset+WHISPER_SAMPLE_RATE]);
|
||||
if(::vad_simple(audio_chunk, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
||||
pcmf32.resize(offset+WHISPER_SAMPLE_RATE);
|
||||
if (offset*1000/WHISPER_SAMPLE_RATE+1000 > maxlength_ms) {
|
||||
//remove samples from the beginning
|
||||
pcmf32.erase(pcmf32.begin(),pcmf32.end()-(maxlength_ms*WHISPER_SAMPLE_RATE/1000));
|
||||
fprintf(stderr, "Shortened samples");
|
||||
}
|
||||
return start_time + offset*1000/WHISPER_SAMPLE_RATE+1000;
|
||||
}
|
||||
}
|
||||
}
|
||||
size_t window_duration = std::max((uint64_t)1000, time_now-start_time);
|
||||
audio.get(window_duration, pcmf32);
|
||||
while (!::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
|
||||
std::this_thread::sleep_for(milliseconds(100));
|
||||
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
|
||||
window_duration = std::max((uint64_t)1000,time_now-start_time);
|
||||
audio.get(window_duration, pcmf32);
|
||||
}
|
||||
if (time_now - start_time > maxlength_ms) {
|
||||
audio.get(maxlength_ms, pcmf32);
|
||||
} else {
|
||||
audio.get(time_now - start_time, pcmf32);
|
||||
}
|
||||
|
||||
return time_now;
|
||||
}
|
||||
|
||||
json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params ¶ms) {
|
||||
std::vector<whisper_token> prompt_tokens;
|
||||
std::vector<float> pcmf32;
|
||||
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32);
|
||||
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
if (jparams.contains("prompt")) {
|
||||
// unlikely to see much use. Under normal circumstances, no_context would be set to false
|
||||
std::string prompt = jparams.at("prompt");
|
||||
prompt_tokens.resize(1024);
|
||||
int n = whisper_tokenize(ctx, prompt.c_str(), prompt_tokens.data(), 1024);
|
||||
prompt_tokens.resize(n);
|
||||
|
||||
wparams.prompt_tokens = prompt_tokens.data();
|
||||
wparams.prompt_n_tokens = prompt_tokens.size();
|
||||
}
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = false;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = jparams.value("no_context", true);
|
||||
wparams.single_segment = true;
|
||||
wparams.max_tokens = params.max_tokens;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
wparams.speed_up = params.speed_up;
|
||||
wparams.suppress_non_speech_tokens = true;
|
||||
// run the transformer and a single decoding pass
|
||||
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
||||
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
|
||||
throw json{
|
||||
{"code", -32803},
|
||||
{"message", "ERROR: whisper_full() failed"}
|
||||
};
|
||||
}
|
||||
std::string result = whisper_full_get_segment_text(ctx,0);
|
||||
return json {
|
||||
{"transcription", result},
|
||||
{"timestamp", unprocessed_audio_timestamp}
|
||||
};
|
||||
}
|
||||
|
||||
// command-list mode
|
||||
// guide the transcription to match the most likely command from a provided list
|
||||
json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms, json jparams, std::vector<struct commandset> commandset_list) {
|
||||
struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)];
|
||||
std::vector<float> pcmf32;
|
||||
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32);
|
||||
|
||||
fprintf(stderr, "%s: Speech detected! Processing ...\n", __func__);
|
||||
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
|
||||
|
||||
wparams.print_progress = false;
|
||||
wparams.print_special = params.print_special;
|
||||
wparams.print_realtime = false;
|
||||
wparams.print_timestamps = false;
|
||||
wparams.translate = params.translate;
|
||||
wparams.no_context = true;
|
||||
wparams.single_segment = true;
|
||||
wparams.max_tokens = 1;
|
||||
wparams.language = params.language.c_str();
|
||||
wparams.n_threads = params.n_threads;
|
||||
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
// TODO: Do some time testing. Does an overly long prompt slow down processing?
|
||||
// Set up command sets/precompute prompts
|
||||
wparams.prompt_tokens = cs.prompt_tokens.data();
|
||||
wparams.prompt_n_tokens = cs.prompt_tokens.size();
|
||||
// TODO: properly expose as option
|
||||
wparams.suppress_non_speech_tokens = true;
|
||||
|
||||
// run the transformer and a single decoding pass
|
||||
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
|
||||
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
|
||||
throw json{
|
||||
{"code", -32803},
|
||||
{"message", "ERROR: whisper_full() failed"}//TODO: format string (sprintf?)
|
||||
};
|
||||
}
|
||||
|
||||
// estimate command probability
|
||||
// NOTE: not optimal
|
||||
{
|
||||
const auto * logits = whisper_get_logits(ctx);
|
||||
|
||||
std::vector<float> probs(whisper_n_vocab(ctx), 0.0f);
|
||||
|
||||
// compute probs from logits via softmax
|
||||
{
|
||||
float max = -1e9;
|
||||
for (int i = 0; i < (int) probs.size(); ++i) {
|
||||
max = std::max(max, logits[i]);
|
||||
}
|
||||
|
||||
float sum = 0.0f;
|
||||
for (int i = 0; i < (int) probs.size(); ++i) {
|
||||
probs[i] = expf(logits[i] - max);
|
||||
sum += probs[i];
|
||||
}
|
||||
|
||||
for (int i = 0; i < (int) probs.size(); ++i) {
|
||||
probs[i] /= sum;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::pair<float, int>> probs_id;
|
||||
|
||||
// In my testing, the most verbose token is always the desired.
|
||||
// TODO: Trim commandset struct once efficacy has been verified
|
||||
for (int i = 0; i < (int) cs.commands.size(); ++i) {
|
||||
probs_id.emplace_back(probs[cs.commands[i].tokens[0]], i);
|
||||
}
|
||||
|
||||
// sort descending
|
||||
{
|
||||
using pair_type = decltype(probs_id)::value_type;
|
||||
std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
}
|
||||
int id = probs_id[0].second;
|
||||
return json{
|
||||
{"command_index", id},
|
||||
{"command_text", cs.commands[id].plaintext},
|
||||
{"timestamp", unprocessed_audio_timestamp},
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
|
||||
// TODO: check for token collision
|
||||
struct commandset cs;
|
||||
|
||||
std::string k_prompt = " select one from the available words: ";
|
||||
std::set<whisper_token> token_set;
|
||||
whisper_token tokens[32];
|
||||
for (std::string s : jparams) {
|
||||
std::vector<whisper_token> token_vec;
|
||||
// The existing command implementation uses a nested for loop to tokenize single characters
|
||||
// I fail to see the purpose of this when ' a' has a wholly different pronunciation than the start of ' apple'
|
||||
const int n = whisper_tokenize(ctx, (" " + s).c_str(), tokens, 32);
|
||||
if (n < 0) {
|
||||
fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, s.c_str());
|
||||
return 3;
|
||||
}
|
||||
token_vec.push_back(tokens[0]);
|
||||
if (!token_set.insert(tokens[0]).second) {
|
||||
fprintf(stderr, "%s: warning: %s is a duplicate of an existing token\n", __func__, s.c_str());
|
||||
throw json{
|
||||
{"code",-31000},
|
||||
{"message", "Duplicate token in token set: " + s}
|
||||
};
|
||||
}
|
||||
if (n > 1) {// empty string if n=0? Should never occur
|
||||
fprintf(stderr, "%s: error: command is more than a single token: %s\n", __func__, s.c_str());
|
||||
}
|
||||
struct command command = {token_vec, s};
|
||||
cs.commands.push_back(command);
|
||||
k_prompt += s;
|
||||
}
|
||||
k_prompt = k_prompt.substr(0,k_prompt.length()-2) + ". Selected word:";
|
||||
cs.prompt_tokens.resize(1024);
|
||||
int n = whisper_tokenize(ctx, k_prompt.c_str(), cs.prompt_tokens.data(), 1024);
|
||||
cs.prompt_tokens.resize(n);
|
||||
// prepare response
|
||||
int index = commandset_list.size();
|
||||
commandset_list.push_back(cs);
|
||||
return json{{"index",index}};
|
||||
}
|
||||
json seek(struct whisper_context * ctx, audio_async &audio, json params) {
|
||||
// whisper_state has the pertinent offsets, but there also seem to be a large
|
||||
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
|
||||
// I'll give this a another pass once everything else is implemented,
|
||||
// but for now, it's unsupported
|
||||
throw json{
|
||||
{"code", -32601},
|
||||
{"message", "Seeking is not yet supported."}
|
||||
};
|
||||
}
|
||||
json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms, std::vector<struct commandset> &commandset_list) {
|
||||
// See: https://www.jsonrpc.org/specification
|
||||
json id = body.at("id");
|
||||
try {
|
||||
std::string version = body.at("jsonrpc");
|
||||
if (version != "2.0") {
|
||||
// unsupported version
|
||||
throw json{
|
||||
{"code", -3260},
|
||||
{"message", "invalid jsonrpc version"}
|
||||
};
|
||||
}
|
||||
std::string method = body.at("method");
|
||||
json jparams = json{{"dummy", "dummy"}};
|
||||
if (body.contains("params"))
|
||||
jparams = body.at("params");
|
||||
json res;
|
||||
// TODO: be consistent about argument order
|
||||
fprintf(stderr, "Dispatching a job\n");
|
||||
if (method == "unguided") { res = unguided_transcription(ctx, audio, jparams, params); }
|
||||
else if (method == "guided") { res = guided_transcription(ctx, audio, params, jparams, commandset_list); }
|
||||
else if (method == "seek") { res = seek(ctx, audio, jparams); }
|
||||
else if (method == "registerCommandset") { res = register_commandset(ctx, jparams, commandset_list); }
|
||||
else if (method == "echo") { res = jparams; }
|
||||
|
||||
|
||||
return json{
|
||||
{"jsonrpc", "2.0"},
|
||||
{"result", res},
|
||||
{"id", id}
|
||||
};
|
||||
} catch(json ex) {
|
||||
return json {
|
||||
{"jsonrpc", "2.0"},
|
||||
{"error", ex},
|
||||
{"id", id}
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params ¶ms) {
|
||||
std::deque<json> jobqueue;
|
||||
std::vector<struct commandset> commandset_list;
|
||||
while (true) {
|
||||
// For eventual cancellation support, shouldn't block if job exists
|
||||
if (std::cin.rdbuf()->in_avail() > 22 || jobqueue.size() == 0) {
|
||||
int content_length;
|
||||
if (scanf("Content-Length: %d", &content_length) != 1) {
|
||||
fprintf(stderr, "Could not read input: %d", std::cin.peek());
|
||||
return;
|
||||
}
|
||||
// scanf leaves the new lines intact
|
||||
std::cin.ignore(2);
|
||||
if (std::cin.peek() != 13) {
|
||||
// Content-Type. jsonrpc necessitates utf8.
|
||||
std::cin.ignore(200,10);
|
||||
}
|
||||
std::cin.ignore(2);
|
||||
// A message is being sent and blocking is acceptable
|
||||
std::string content(content_length,'\0');
|
||||
std::cin.read(&content[0], content_length);
|
||||
json job = json::parse(content);
|
||||
// TODO: Some messages(cancellation) should skip queue here
|
||||
if (job.is_array()) {
|
||||
// response must also be batched. Will implement later
|
||||
// for (subjob : job.begin())
|
||||
// TODO: At the very least respond with an unsupported error.
|
||||
} else {
|
||||
jobqueue.push_back(job);
|
||||
}
|
||||
}
|
||||
assert(jobqueue.size() > 0);
|
||||
json job = jobqueue.front();
|
||||
json resp = parse_job(job, ctx, audio, params, commandset_list);
|
||||
if (resp != "unfinished") {
|
||||
jobqueue.pop_front();
|
||||
// send response
|
||||
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
|
||||
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
|
||||
std::cout.flush();
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
int main(int argc, char ** argv) {
|
||||
whisper_params params;
|
||||
if (whisper_params_parse(argc, argv, params) == false) {
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (whisper_lang_id(params.language.c_str()) == -1) {
|
||||
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
exit(0);
|
||||
}
|
||||
|
||||
// whisper init
|
||||
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
|
||||
// init audio
|
||||
|
||||
audio_async audio(30*1000);
|
||||
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
|
||||
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
|
||||
audio.resume();
|
||||
// TODO: Investigate why this is required. An extra second of startup latency is not great
|
||||
// wait for 1 second to avoid any buffered noise
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
|
||||
audio.clear();
|
||||
// TODO: consider some sort of indicator to designate loading has finished?
|
||||
// Potentially better for the client to just start with a non-blocking message (register commands)
|
||||
process_loop(ctx, audio, params);
|
||||
|
||||
audio.pause();
|
||||
whisper_print_timings(ctx);
|
||||
whisper_free(ctx);
|
||||
|
||||
return 0;
|
||||
}
|
@ -1,362 +0,0 @@
|
||||
if !exists("g:whisper_dir")
|
||||
let g:whisper_dir = expand($WHISPER_CPP_HOME)
|
||||
if g:whisper_dir == ""
|
||||
echoerr "Please provide a path to the whisper.cpp repo in either the $WHISPER_CPP_HOME environment variable, or g:whisper_dir"
|
||||
endif
|
||||
endif
|
||||
if !exists("g:whisper_lsp_path")
|
||||
let g:whisper_lsp_path = g:whisper_dir .. "lsp"
|
||||
if !filereadable(g:whisper_lsp_path)
|
||||
echoerr "Was not able to locate a lsp executable at: " .. g:whisper_lsp_path
|
||||
throw "Executable not found"
|
||||
endif
|
||||
endif
|
||||
if !exists("g:whisper_model_path")
|
||||
" TODO: allow custom paths relative to the repo dir
|
||||
let g:whisper_model_path = g:whisper_dir .. "models/ggml-base.en.bin"
|
||||
if !filereadable(g:whisper_model_path)
|
||||
echoerr "Could not find model at: " .. g:whisper_model_path
|
||||
throw "Model not found"
|
||||
endif
|
||||
endif
|
||||
let s:output_buffer = bufnr("whisper_log", v:true)
|
||||
call setbufvar(s:output_buffer,"&buftype","nofile")
|
||||
let s:lsp_command = [g:whisper_lsp_path,"-m",g:whisper_model_path]
|
||||
" For faster execution. TODO: server load multiple models/run multiple servers?
|
||||
" let s:lsp_command = [g:whisper_lsp_path, "-m", g:whisper_dir .. "models/ggml-tiny.en.bin", "-ac", "128"]
|
||||
|
||||
" requestCommands([params_dict])
|
||||
func whisper#requestCommands(...)
|
||||
let l:req = {"method": "guided", "params": {"commandset_index": 0}}
|
||||
if a:0 > 0
|
||||
call extend(l:req.params, a:1)
|
||||
endif
|
||||
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback", [l:req.params, 0])})
|
||||
endfunction
|
||||
|
||||
" doTranscription([params_dict])
|
||||
func whisper#doTranscription(...)
|
||||
let l:req = {"method": "unguided", "params": {}}
|
||||
if a:0 > 0
|
||||
call extend(l:req.params, a:1)
|
||||
endif
|
||||
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:transcriptionCallback", [function("s:insertText"),function("s:endTranscription")])})
|
||||
endfunction
|
||||
|
||||
" For testing
|
||||
func whisper#uppertest(cha)
|
||||
echo tr(a:cha, s:c_lowerkeys, s:c_upperkeys)
|
||||
endfunction
|
||||
|
||||
|
||||
" (upper, exit, count, motion, command, insert/append, save run) "base"
|
||||
" (upper, exit, count, motion, command, inside/around) "motion/visual"
|
||||
" (upper, exit, count, motion, line, inside/around) "command already entered"
|
||||
" (upper, exit, key, ) "from/till"
|
||||
|
||||
" upper and lower keys is used to translate between cases with tr
|
||||
" Must be sunchronized
|
||||
let s:c_lowerkeys = "1234567890-=qwertyuiop[]\\asdfghjkl;'zxcvbnm,./\""
|
||||
let s:c_upperkeys = "!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?'"
|
||||
let s:c_count = split("1234567890\"",'\zs')
|
||||
let s:c_command = split("ryuogpdxcv.iam", '\zs')
|
||||
let s:c_motion = split("wetf'hjklnb$^)",'\zs')
|
||||
" object words: Word, Sentence, Paragraph, [, (, <, Tag, {. ", '
|
||||
let s:c_area = split("wsp])>t}\"'",'\zs')
|
||||
"Special commands.
|
||||
let s:c_special_always = ["exit", "upper"]
|
||||
let s:c_special_normal = ["save", "run", "space"]
|
||||
|
||||
" If not in dict, key is spoken word,
|
||||
" If key resolves to string, value is used for normal/motion, but key for chars
|
||||
" If key resolves to dict, {0: "normal",1: "motion",2:"single char",3: "area"}
|
||||
" Missing entries fall back as follows {0: "required", 1: 0, 2: "key", 3: 0}
|
||||
let s:spoken_dict = {"w": "word", "e": "end", "r": "replace", "t": {0: "till", 3: "tag"}, "y": "yank", "u": "undo", "i": {0: "insert", 1: "inside"}, "o": "open", "p": {0: "paste", 3: "paragraph"}, "a": {0: "append", 1: "around"}, "s": {0: "substitute", 3: "sentence"}, "d": "delete", "f": "from", "g": "go", "h": "left", "j": "down", "k": "up", "l": "right", "c": "change", "v": "visual", "b": "back", "n": "next", "m": "mark", ".": {0: "repeat", 2: "period"}, "]": {0: "bracket", 2: "bracket"}, "'": {0: "jump", 2: "apostrophe", 3: "apostrophe"}, '"': {0: 'register', 2: "quotation", 3: "quotation"}, "-": {0: "minus", 2: "minus"}, "$": {0: "dollar", 2: "dollar"}, "^": {0: "carrot", 2: "carrot"}, ")": {0: "sentence", 2: "parenthesis", 3: "parenthesis"}, "}": {0: "paragraph", 2: "brace", 3: "brace"}, ">": {0: "indent", 2: "angle", 3: "angle"}}
|
||||
|
||||
" Give this another pass. This seems overly hacky even if it's functional
|
||||
let s:sub_tran_msg = ""
|
||||
func s:subTranProg(msg)
|
||||
if s:sub_tran_msg != ""
|
||||
let s:sub_tran_msg = s:sub_tran_msg .. a:msg
|
||||
if mode() !=? 'v'
|
||||
exe "normal" "u" .. s:sub_tran_msg
|
||||
endif
|
||||
else
|
||||
if s:command_backlog == ""
|
||||
" this should not occur
|
||||
call s:logCallback(0, "Warning: Encountered sub transcription without prior command")
|
||||
let s:command_backlog = "a"
|
||||
endif
|
||||
if a:msg[0] == ' '
|
||||
let s:sub_tran_msg = s:command_backlog .. a:msg[1:-1]
|
||||
else
|
||||
let s:sub_tran_msg = s:command_backlog .. a:msg
|
||||
endif
|
||||
if mode() !=? 'v'
|
||||
exe "normal" s:sub_tran_msg
|
||||
endif
|
||||
endif
|
||||
call appendbufline(s:output_buffer, "$", s:sub_tran_msg .. ":" .. string(a:msg ))
|
||||
endfunction
|
||||
|
||||
func s:subTranFinish(params, timestamp)
|
||||
let s:repeat_command = s:sub_tran_msg
|
||||
" Visual selection is lot if used with streaming, so streaming of partial
|
||||
" transcriptions is disabled in visual mode
|
||||
if mode() ==? 'v'
|
||||
exe "normal" s:sub_tran_msg
|
||||
endif
|
||||
let s:sub_tran_msg = ""
|
||||
let s:command_backlog = ""
|
||||
exe "normal a\<C-G>u"
|
||||
let l:params = a:params
|
||||
let l:params.timestamp = a:timestamp
|
||||
if exists("l:params.commandset_index")
|
||||
unlet l:params.commandset_index
|
||||
endif
|
||||
call whisper#requestCommands(a:params)
|
||||
endfunction
|
||||
|
||||
func s:logCallback(channel, msg)
|
||||
call appendbufline(s:output_buffer,"$",a:msg)
|
||||
endfunction
|
||||
|
||||
|
||||
func s:transcriptionCallback(progressCallback, finishedCallback, channel, msg)
|
||||
let l:tr = a:msg.result.transcription
|
||||
|
||||
let l:ex_ind = match(tolower(l:tr),"exit", len(l:tr)-6)
|
||||
" The worst case I've observed so far is " Exit.", which is 6 characters
|
||||
if l:ex_ind != -1
|
||||
call a:progressCallback(strpart(l:tr,0,l:ex_ind-1))
|
||||
call a:finishedCallback(a:msg.result.timestamp)
|
||||
else
|
||||
call a:progressCallback(l:tr)
|
||||
let req = {"method": "unguided", "params": {"timestamp": a:msg.result.timestamp, "no_context": v:true}}
|
||||
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [a:progressCallback, a:finishedCallback])})
|
||||
endif
|
||||
endfunc
|
||||
func s:insertText(msg)
|
||||
exe "normal a" .. a:msg
|
||||
endfunction
|
||||
func s:endTranscription(timestamp)
|
||||
call appendbufline(s:output_buffer, "$", "Ending unguided transcription")
|
||||
endfunction
|
||||
|
||||
|
||||
|
||||
" If a command does not include a whole actionable step, attempting to execute
|
||||
" it discards the remainder of things. There is likely a simpler solution,
|
||||
" but it can be made functional now by storing a backbuffer until actionable
|
||||
let s:command_backlog = ""
|
||||
let s:repeat_command = ""
|
||||
let s:preceeding_upper = v:false
|
||||
func s:commandCallback(params, commandset_index, channel, msg)
|
||||
let l:command_index = a:msg.result.command_index
|
||||
let l:do_execute = v:false
|
||||
let l:next_mode = a:commandset_index
|
||||
let l:command = s:commandset_list[a:commandset_index][l:command_index]
|
||||
call s:logCallback(0, string(a:msg) .. " " .. a:commandset_index .. " " .. l:command)
|
||||
if l:command_index == 0
|
||||
"exit
|
||||
"if s:command_backlog == ""
|
||||
call s:logCallback(0,"Stopping command mode")
|
||||
echo "No longer listening"
|
||||
let s:command_backlog = ""
|
||||
return
|
||||
"else
|
||||
" Legacy code to clear an existing buffer with exit.
|
||||
" Was found to be rarely desired and is better introduced as a
|
||||
" standalone command (clear?)
|
||||
" call s:logCallback(0,"Clearing command_backlog" .. s:command_backlog)
|
||||
" let s:command_backlog = ""
|
||||
" let s:preceeding_upper = v:false
|
||||
" endif
|
||||
elseif l:command_index == 1
|
||||
" upper
|
||||
let s:preceeding_upper = !s:preceeding_upper
|
||||
elseif l:command == "save"
|
||||
" save and run can only happen in commandset 0,
|
||||
exe "w"
|
||||
elseif l:command == "run"
|
||||
exe "make run"
|
||||
elseif l:command == "space"
|
||||
exe "normal i \<ESC>l"
|
||||
elseif has_key(s:c_user, l:command)
|
||||
let Userfunc = s:c_user[l:command]
|
||||
if type(Userfunc) == v:t_string
|
||||
let Userfunc = function(Userfunc)
|
||||
endif
|
||||
call Userfunc()
|
||||
else
|
||||
if s:preceeding_upper
|
||||
" Upper should keep commandset
|
||||
let s:preceeding_upper = v:false
|
||||
let l:visual_command = tr(l:command, s:c_lowerkeys, s:c_upperkeys)
|
||||
else
|
||||
let l:visual_command = l:command
|
||||
endif
|
||||
echo s:command_backlog .. " - " .. l:visual_command
|
||||
let s:command_backlog = s:command_backlog .. l:visual_command
|
||||
if a:commandset_index == 2 || a:commandset_index == 3
|
||||
" single key, either completes motion, replace, or register
|
||||
" Should move to execute unless part of a register
|
||||
" Change will be caught at execute
|
||||
if s:command_backlog[-2:-2] !=# '"'
|
||||
call s:logCallback(0,"not register")
|
||||
let l:do_execute = v:true
|
||||
end
|
||||
let l:next_mode = 0
|
||||
" commandset index only matters for a/i
|
||||
elseif (l:command == "a" || l:command == "i") && a:commandset_index == 1
|
||||
" inside/around. Is commandset 3
|
||||
let l:next_mode = 3
|
||||
elseif l:command ==# '"'
|
||||
let l:next_mode = 2
|
||||
elseif index(s:c_count, l:command) != -1
|
||||
let l:next_mode = a:commandset_index
|
||||
elseif index(s:c_motion, l:command) != -1
|
||||
if l:command == 't' || l:command == 'f' || l:command == "'"
|
||||
" prompt single key
|
||||
let l:next_mode = 2
|
||||
else
|
||||
let l:do_execute = v:true
|
||||
let l:next_mode = 0
|
||||
endif
|
||||
elseif index(s:c_command, l:command) != -1
|
||||
if index(["y","g","d","c"], s:command_backlog[-1:-1]) != -1 && s:command_backlog[-1:-1] != s:command_backlog[-2:-2] && mode() !=? 'v'
|
||||
" need motion or repeated command
|
||||
" Potential for bad state here if disparaging command keys are
|
||||
" entered (i.e. yd), but vim can handle checks for this at exe
|
||||
" And checking for cases like y123d would complicate things
|
||||
let l:next_mode = 1
|
||||
elseif index(["i","a","c", "o", "s"], l:command) != -1 || s:command_backlog[-1:-1] ==# 'R'
|
||||
"'Insert' mode, do general transcription
|
||||
let l:req = {"method": "unguided", "params": a:params}
|
||||
let l:req.params.timestamp = a:msg.result.timestamp
|
||||
let l:req.params.no_context = v:true
|
||||
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
|
||||
return
|
||||
elseif l:command == 'r' || l:command == 'm'
|
||||
let l:next_mode = 2
|
||||
elseif l:command == '.'
|
||||
let l:next_mode = 0
|
||||
let l:do_execute = v:true
|
||||
let s:command_backlog = s:command_backlog[0:-2] .. s:repeat_command
|
||||
else
|
||||
if l:command ==? 'v'
|
||||
let l:next_mode = 1
|
||||
else
|
||||
let l:next_mode = 0
|
||||
endif
|
||||
let l:do_execute = v:true
|
||||
endif
|
||||
else
|
||||
throw "Invalid command state: " .. l:command .. " " .. a:commandset_index .. " " .. s:command_backlog
|
||||
endif
|
||||
endif
|
||||
if l:do_execute
|
||||
if mode() ==?'v' && l:next_mode == 0
|
||||
let l:next_mode = 1
|
||||
elseif match(s:command_backlog, 'c') != -1
|
||||
let l:req = {"method": "unguided", "params": a:params}
|
||||
let l:req.params.timestamp = a:msg.result.timestamp
|
||||
let l:req.params.no_context = v:true
|
||||
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
|
||||
return
|
||||
endif
|
||||
exe "normal" s:command_backlog
|
||||
if index(s:c_motion + ["u"],l:command) == -1
|
||||
exe "normal a\<C-G>u"
|
||||
let s:repeat_command = s:command_backlog
|
||||
call s:logCallback(0, s:command_backlog)
|
||||
endif
|
||||
let s:command_backlog = ""
|
||||
endif
|
||||
let l:req = {"method": "guided", "params": a:params}
|
||||
let l:req.params.timestamp = a:msg.result.timestamp
|
||||
let l:req.params.commandset_index = l:next_mode
|
||||
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback",[a:params, l:next_mode])})
|
||||
endfunction
|
||||
|
||||
func s:loadedCallback(channel, msg)
|
||||
echo "Loading complete"
|
||||
call s:logCallback(a:channel, a:msg)
|
||||
endfunction
|
||||
|
||||
func s:registerCommandset(commandlist, is_final)
|
||||
let req = {"method": "registerCommandset"}
|
||||
let req.params = a:commandlist
|
||||
call s:logCallback(0, join(a:commandlist))
|
||||
call add(g:whisper_commandlist_spoken, a:commandlist)
|
||||
if a:is_final
|
||||
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:loadedCallback"})
|
||||
else
|
||||
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:logCallback"})
|
||||
endif
|
||||
endfunction
|
||||
|
||||
func s:registerAllCommands()
|
||||
let l:normal = s:c_special_always + s:c_special_normal + s:c_count + s:c_command + s:c_motion + keys(s:c_user)
|
||||
let l:visual = s:c_special_always + s:c_count + s:c_command + s:c_motion
|
||||
" Currently the same as visual.
|
||||
" let l:post_command = s:c_special_always + s:c_count + s:c_command + s:c_motion
|
||||
let l:single_key = s:c_special_always + split(s:c_lowerkeys, '\zs')
|
||||
let l:area = s:c_special_always + s:c_area
|
||||
|
||||
" Used only for compatibility with the testing script
|
||||
let g:whisper_commandlist_spoken = []
|
||||
|
||||
let s:commandset_list = [l:normal, l:visual, l:single_key, l:area]
|
||||
call s:registerCommandset(s:commandsetToSpoken(l:normal, 0), v:false)
|
||||
call s:registerCommandset(s:commandsetToSpoken(l:visual, 1), v:false)
|
||||
call s:registerCommandset(s:commandsetToSpoken(l:single_key, 2), v:false)
|
||||
call s:registerCommandset(s:commandsetToSpoken(l:area, 3), v:true)
|
||||
endfunction
|
||||
|
||||
func s:commandsetToSpoken(commandset, spoken_index)
|
||||
let l:spoken_list = []
|
||||
for l:command in a:commandset
|
||||
if has_key(s:spoken_dict, l:command)
|
||||
let l:spoken_value = s:spoken_dict[l:command]
|
||||
if type(l:spoken_value) == v:t_dict
|
||||
if has_key(l:spoken_value, a:spoken_index)
|
||||
let l:spoken_value = l:spoken_value[a:spoken_index]
|
||||
else
|
||||
if a:spoken_index == 2
|
||||
let l:spoken_value = l:command
|
||||
else
|
||||
let l:spoken_value = l:spoken_value[0]
|
||||
endif
|
||||
endif
|
||||
else
|
||||
if a:spoken_index == 2
|
||||
let l:spoken_value = l:command
|
||||
endif
|
||||
endif
|
||||
else
|
||||
let l:spoken_value = l:command
|
||||
endif
|
||||
call add(l:spoken_list, l:spoken_value)
|
||||
endfor
|
||||
return l:spoken_list
|
||||
endfunction
|
||||
|
||||
" TODO: Check lifetime. If the script is resourced, is the existing
|
||||
" s:lsp_job dropped and therefore killed?
|
||||
" This seems to not be the case and I've had to deal with zombie processes
|
||||
" that survive exiting vim, even though said behavior conflicts with my
|
||||
" understanding of the provided documentation
|
||||
let s:lsp_opts = {"in_mode": "lsp", "out_mode": "lsp", "err_mode": "nl", "err_io": "buffer", "err_buf": s:output_buffer}
|
||||
if !exists("g:lsp_job")
|
||||
if exists("g:whisper_user_commands")
|
||||
let s:c_user = g:whisper_user_commands
|
||||
else
|
||||
let s:c_user = {}
|
||||
endif
|
||||
let g:lsp_job = job_start(s:lsp_command, s:lsp_opts)
|
||||
if job_status(g:lsp_job) == "fail"
|
||||
echoerr "Failed to start whisper job"
|
||||
endif
|
||||
call s:registerAllCommands()
|
||||
endif
|
@ -59,7 +59,6 @@ struct whisper_params {
|
||||
int32_t offset_t_ms = 0;
|
||||
int32_t offset_n = 0;
|
||||
int32_t duration_ms = 0;
|
||||
int32_t progress_step = 5;
|
||||
int32_t max_context = -1;
|
||||
int32_t max_len = 0;
|
||||
int32_t best_of = 2;
|
||||
@ -70,7 +69,6 @@ struct whisper_params {
|
||||
float logprob_thold = -1.00f;
|
||||
|
||||
bool speed_up = false;
|
||||
bool debug_mode = false;
|
||||
bool translate = false;
|
||||
bool detect_language = false;
|
||||
bool diarize = false;
|
||||
@ -88,7 +86,6 @@ struct whisper_params {
|
||||
bool print_colors = false;
|
||||
bool print_progress = false;
|
||||
bool no_timestamps = false;
|
||||
bool log_score = false;
|
||||
|
||||
std::string language = "en";
|
||||
std::string prompt;
|
||||
@ -136,8 +133,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
|
||||
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
|
||||
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
|
||||
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
|
||||
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
|
||||
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
|
||||
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
|
||||
@ -162,7 +158,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
|
||||
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
|
||||
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
@ -192,8 +187,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
|
||||
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
|
||||
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
|
||||
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
|
||||
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
|
||||
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
|
||||
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
|
||||
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
|
||||
@ -217,7 +211,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
|
||||
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
|
||||
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@ -225,7 +218,6 @@ struct whisper_print_user_data {
|
||||
const whisper_params * params;
|
||||
|
||||
const std::vector<std::vector<float>> * pcmf32s;
|
||||
int progress_prev;
|
||||
};
|
||||
|
||||
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
|
||||
@ -260,14 +252,6 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
|
||||
|
||||
return speaker;
|
||||
}
|
||||
void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
|
||||
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
|
||||
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
|
||||
if (progress >= *progress_prev + progress_step) {
|
||||
*progress_prev += progress_step;
|
||||
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
|
||||
}
|
||||
}
|
||||
|
||||
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
|
||||
const auto & params = *((whisper_print_user_data *) user_data)->params;
|
||||
@ -492,25 +476,6 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
|
||||
return true;
|
||||
}
|
||||
|
||||
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
std::ofstream fout(fname);
|
||||
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
|
||||
|
||||
const int n_segments = whisper_full_n_segments(ctx);
|
||||
// fprintf(stderr,"segments: %d\n",n_segments);
|
||||
for (int i = 0; i < n_segments; ++i) {
|
||||
const int n_tokens = whisper_full_n_tokens(ctx, i);
|
||||
// fprintf(stderr,"tokens: %d\n",n_tokens);
|
||||
for (int j = 0; j < n_tokens; j++) {
|
||||
auto token = whisper_full_get_token_text(ctx, i, j);
|
||||
auto probability = whisper_full_get_token_p(ctx, i, j);
|
||||
fout << token << '\t' << probability << std::endl;
|
||||
// fprintf(stderr,"token: %s %f\n",token,probability);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
|
||||
std::ofstream fout(fname);
|
||||
int indent = 0;
|
||||
@ -918,7 +883,6 @@ int main(int argc, char ** argv) {
|
||||
wparams.split_on_word = params.split_on_word;
|
||||
|
||||
wparams.speed_up = params.speed_up;
|
||||
wparams.debug_mode = params.debug_mode;
|
||||
|
||||
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
||||
|
||||
@ -931,7 +895,7 @@ int main(int argc, char ** argv) {
|
||||
wparams.entropy_thold = params.entropy_thold;
|
||||
wparams.logprob_thold = params.logprob_thold;
|
||||
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s, 0 };
|
||||
whisper_print_user_data user_data = { ¶ms, &pcmf32s };
|
||||
|
||||
// this callback is called on each new segment
|
||||
if (!wparams.print_realtime) {
|
||||
@ -939,11 +903,6 @@ int main(int argc, char ** argv) {
|
||||
wparams.new_segment_callback_user_data = &user_data;
|
||||
}
|
||||
|
||||
if (wparams.print_progress) {
|
||||
wparams.progress_callback = whisper_print_progress_callback;
|
||||
wparams.progress_callback_user_data = &user_data;
|
||||
}
|
||||
|
||||
// example for abort mechanism
|
||||
// in this example, we do not abort the processing, but we could if the flag is set to true
|
||||
// the callback is called before every encoder run - if it returns false, the processing is aborted
|
||||
@ -1008,12 +967,6 @@ int main(int argc, char ** argv) {
|
||||
const auto fname_lrc = fname_out + ".lrc";
|
||||
output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
|
||||
}
|
||||
|
||||
// output to score file
|
||||
if (params.log_score) {
|
||||
const auto fname_score = fname_out + ".score.txt";
|
||||
output_score(ctx, fname_score.c_str(), params, pcmf32s);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -138,7 +138,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
|
||||
// return false;
|
||||
//}
|
||||
|
||||
char word[129];
|
||||
char word[128];
|
||||
|
||||
for (int i = 0; i < n_vocab; i++) {
|
||||
uint32_t len;
|
||||
|
@ -47,7 +47,6 @@ struct whisper_params {
|
||||
bool print_special = false;
|
||||
bool no_context = true;
|
||||
bool no_timestamps = false;
|
||||
bool tinydiarize = false;
|
||||
|
||||
std::string language = "en";
|
||||
std::string model = "models/ggml-base.en.bin";
|
||||
@ -81,8 +80,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
|
||||
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
|
||||
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
|
||||
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
|
||||
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
|
||||
|
||||
else {
|
||||
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
|
||||
whisper_print_usage(argc, argv, params);
|
||||
@ -116,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
|
||||
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
|
||||
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
|
||||
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
|
||||
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
|
||||
fprintf(stderr, "\n");
|
||||
}
|
||||
|
||||
@ -303,8 +299,6 @@ int main(int argc, char ** argv) {
|
||||
wparams.audio_ctx = params.audio_ctx;
|
||||
wparams.speed_up = params.speed_up;
|
||||
|
||||
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
|
||||
|
||||
// disable temperature fallback
|
||||
//wparams.temperature_inc = -1.0f;
|
||||
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
|
||||
@ -350,19 +344,10 @@ int main(int argc, char ** argv) {
|
||||
const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
|
||||
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
|
||||
|
||||
std::string output = "[" + to_timestamp(t0) + " --> " + to_timestamp(t1) + "] " + text;
|
||||
|
||||
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
|
||||
output += " [SPEAKER_TURN]";
|
||||
}
|
||||
|
||||
output += "\n";
|
||||
|
||||
printf("%s", output.c_str());
|
||||
fflush(stdout);
|
||||
printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
|
||||
|
||||
if (params.fname_out.length() > 0) {
|
||||
fout << output;
|
||||
fout << "[" << to_timestamp(t0) << " --> " << to_timestamp(t1) << "] " << text << std::endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -18,9 +18,6 @@ android {
|
||||
vectorDrawables {
|
||||
useSupportLibrary true
|
||||
}
|
||||
ndk {
|
||||
abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
|
||||
}
|
||||
}
|
||||
|
||||
buildTypes {
|
||||
@ -45,8 +42,8 @@ android {
|
||||
}
|
||||
ndkVersion "25.1.8937393"
|
||||
externalNativeBuild {
|
||||
cmake {
|
||||
path = file("src/main/jni/whisper/CMakeLists.txt")
|
||||
ndkBuild {
|
||||
path 'src/main/jni/whisper/Android.mk'
|
||||
}
|
||||
}
|
||||
packagingOptions {
|
||||
|
26
examples/whisper.android/app/src/main/jni/whisper/Android.mk
Normal file
26
examples/whisper.android/app/src/main/jni/whisper/Android.mk
Normal file
@ -0,0 +1,26 @@
|
||||
LOCAL_PATH := $(call my-dir)
|
||||
include $(CLEAR_VARS)
|
||||
LOCAL_MODULE := libwhisper
|
||||
include $(LOCAL_PATH)/Whisper.mk
|
||||
include $(BUILD_SHARED_LIBRARY)
|
||||
|
||||
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
|
||||
include $(CLEAR_VARS)
|
||||
LOCAL_MODULE := libwhisper_vfpv4
|
||||
include $(LOCAL_PATH)/Whisper.mk
|
||||
# Allow building NEON FMA code.
|
||||
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
|
||||
LOCAL_CFLAGS += -mfpu=neon-vfpv4
|
||||
include $(BUILD_SHARED_LIBRARY)
|
||||
endif
|
||||
|
||||
ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
|
||||
include $(CLEAR_VARS)
|
||||
LOCAL_MODULE := libwhisper_v8fp16_va
|
||||
include $(LOCAL_PATH)/Whisper.mk
|
||||
# Allow building NEON FMA code.
|
||||
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
|
||||
LOCAL_CFLAGS += -march=armv8.2-a+fp16
|
||||
include $(BUILD_SHARED_LIBRARY)
|
||||
endif
|
||||
|
@ -0,0 +1 @@
|
||||
APP_STL := c++_static
|
@ -1,53 +0,0 @@
|
||||
cmake_minimum_required(VERSION 3.10)
|
||||
|
||||
project(whisper.cpp)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 11)
|
||||
set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
|
||||
|
||||
set(
|
||||
SOURCE_FILES
|
||||
${WHISPER_LIB_DIR}/ggml.c
|
||||
${WHISPER_LIB_DIR}/whisper.cpp
|
||||
${CMAKE_SOURCE_DIR}/jni.c
|
||||
)
|
||||
|
||||
find_library(LOG_LIB log)
|
||||
|
||||
function(build_library target_name)
|
||||
add_library(
|
||||
${target_name}
|
||||
SHARED
|
||||
${SOURCE_FILES}
|
||||
)
|
||||
|
||||
target_link_libraries(${target_name} ${LOG_LIB} android)
|
||||
|
||||
if (${target_name} STREQUAL "whisper_v8fp16_va")
|
||||
target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
|
||||
elseif (${target_name} STREQUAL "whisper_vfpv4")
|
||||
target_compile_options(${target_name} PRIVATE -mfpu=neon-vfpv4)
|
||||
endif ()
|
||||
|
||||
if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
|
||||
|
||||
target_compile_options(${target_name} PRIVATE -O3)
|
||||
target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
|
||||
target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
|
||||
|
||||
target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
|
||||
target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
|
||||
target_link_options(${target_name} PRIVATE -flto)
|
||||
|
||||
endif ()
|
||||
endfunction()
|
||||
|
||||
build_library("whisper") # Default target
|
||||
|
||||
if (${ANDROID_ABI} STREQUAL "arm64-v8a")
|
||||
build_library("whisper_v8fp16_va")
|
||||
elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
|
||||
build_library("whisper_vfpv4")
|
||||
endif ()
|
||||
|
||||
include_directories(${WHISPER_LIB_DIR})
|
18
examples/whisper.android/app/src/main/jni/whisper/Whisper.mk
Normal file
18
examples/whisper.android/app/src/main/jni/whisper/Whisper.mk
Normal file
@ -0,0 +1,18 @@
|
||||
WHISPER_LIB_DIR := $(LOCAL_PATH)/../../../../../../../
|
||||
LOCAL_LDLIBS := -landroid -llog
|
||||
|
||||
# Make the final output library smaller by only keeping the symbols referenced from the app.
|
||||
ifneq ($(APP_OPTIM),debug)
|
||||
LOCAL_CFLAGS += -O3
|
||||
LOCAL_CFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
|
||||
LOCAL_CFLAGS += -ffunction-sections -fdata-sections
|
||||
LOCAL_LDFLAGS += -Wl,--gc-sections
|
||||
LOCAL_LDFLAGS += -Wl,--exclude-libs,ALL
|
||||
LOCAL_LDFLAGS += -flto
|
||||
endif
|
||||
|
||||
LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR)
|
||||
LOCAL_CPPFLAGS += -std=c++11
|
||||
LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \
|
||||
$(WHISPER_LIB_DIR)/whisper.cpp \
|
||||
$(LOCAL_PATH)/jni.c
|
51
ggml-cuda.cu
51
ggml-cuda.cu
@ -6,60 +6,9 @@
|
||||
#include <atomic>
|
||||
#include <assert.h>
|
||||
|
||||
#if defined(GGML_USE_HIPBLAS)
|
||||
#include <hip/hip_runtime.h>
|
||||
#include <hipblas/hipblas.h>
|
||||
#include <hip/hip_fp16.h>
|
||||
#include <rocblas/rocblas.h>
|
||||
#define CUBLAS_OP_N HIPBLAS_OP_N
|
||||
#define CUBLAS_OP_T HIPBLAS_OP_T
|
||||
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
|
||||
#define CUBLAS_TF32_TENSOR_OP_MATH 0
|
||||
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
|
||||
#define cublasCreate hipblasCreate
|
||||
#define cublasGetStatusString rocblas_status_to_string
|
||||
#define cublasHandle_t hipblasHandle_t
|
||||
#define cublasLoggerConfigure(logIsOn, logToStdOut, logToStdErr, logFileName) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
|
||||
#define cublasSetStream hipblasSetStream
|
||||
#define cublasSgemm hipblasSgemm
|
||||
#define cublasStatus_t hipblasStatus_t
|
||||
#define cudaDeviceProp hipDeviceProp_t
|
||||
#define cudaDeviceSynchronize hipDeviceSynchronize
|
||||
#define cudaError_t hipError_t
|
||||
#define cudaEventCreateWithFlags hipEventCreateWithFlags
|
||||
#define cudaEventDestroy hipEventDestroy
|
||||
#define cudaEventDisableTiming hipEventDisableTiming
|
||||
#define cudaEventRecord hipEventRecord
|
||||
#define cudaEvent_t hipEvent_t
|
||||
#define cudaFree hipFree
|
||||
#define cudaFreeHost hipHostFree
|
||||
#define cudaGetDevice hipGetDevice
|
||||
#define cudaGetDeviceCount hipGetDeviceCount
|
||||
#define cudaGetDeviceProperties hipGetDeviceProperties
|
||||
#define cudaGetErrorString hipGetErrorString
|
||||
#define cudaGetLastError hipGetLastError
|
||||
#define cudaMalloc hipMalloc
|
||||
#define cudaMallocHost hipHostMalloc
|
||||
#define cudaMemcpy hipMemcpy
|
||||
#define cudaMemcpy2DAsync hipMemcpy2DAsync
|
||||
#define cudaMemcpyAsync hipMemcpyAsync
|
||||
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
|
||||
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
|
||||
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
|
||||
#define cudaMemcpyKind hipMemcpyKind
|
||||
#define cudaMemset hipMemset
|
||||
#define cudaSetDevice hipSetDevice
|
||||
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
|
||||
#define cudaStreamNonBlocking hipStreamNonBlocking
|
||||
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
|
||||
#define cudaStream_t hipStream_t
|
||||
#define cudaSuccess hipSuccess
|
||||
#else
|
||||
#include <cuda_runtime.h>
|
||||
#include <cublas_v2.h>
|
||||
#include <cuda_fp16.h>
|
||||
#endif
|
||||
|
||||
#include "ggml-cuda.h"
|
||||
#include "ggml.h"
|
||||
|
@ -653,13 +653,13 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
||||
const int in = tid - step*im; // 0...15 or 0...7
|
||||
|
||||
\n#if K_QUANTS_PER_ITERATION == 1\n
|
||||
#if K_QUANTS_PER_ITERATION == 1
|
||||
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
||||
const int is = 0;
|
||||
\n#else\n
|
||||
#else
|
||||
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
||||
const int is = in / 4;
|
||||
\n#endif\n
|
||||
#endif
|
||||
const int ql_offset = 64*im + l0;
|
||||
const int qh_offset = 32*im + l0;
|
||||
const int s_offset = 8*im + is;
|
||||
@ -676,7 +676,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||
|
||||
const float d = vload_half(0, &x[i].d);
|
||||
|
||||
\n#if K_QUANTS_PER_ITERATION == 1\n
|
||||
#if K_QUANTS_PER_ITERATION == 1
|
||||
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
||||
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
||||
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
||||
@ -686,7 +686,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
||||
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
||||
tmp[16 * ix + tid] += sum;
|
||||
\n#else\n
|
||||
#else
|
||||
float sum = 0;
|
||||
for (int l = 0; l < 4; ++l) {
|
||||
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
||||
@ -695,7 +695,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
||||
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
||||
}
|
||||
tmp[16 * ix + tid] += sum;
|
||||
\n#endif\n
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
|
23
ggml.c
23
ggml.c
@ -292,7 +292,7 @@ typedef double ggml_float;
|
||||
#if defined(_MSC_VER) || defined(__MINGW32__)
|
||||
#include <intrin.h>
|
||||
#else
|
||||
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
|
||||
#if !defined(__riscv)
|
||||
#include <immintrin.h>
|
||||
#endif
|
||||
#endif
|
||||
@ -663,7 +663,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
|
||||
}
|
||||
|
||||
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
|
||||
#ifdef __AVXVNNI__
|
||||
#if __AVXVNNI__
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
|
||||
return _mm256_cvtepi32_ps(summed_pairs);
|
||||
@ -676,7 +676,7 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy)
|
||||
|
||||
// multiply int8_t, add results pairwise twice and return as float vector
|
||||
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
||||
#ifdef __AVXVNNIINT8__
|
||||
#if __AVXVNNIINT8__
|
||||
const __m256i zero = _mm256_setzero_si256();
|
||||
const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
|
||||
return _mm256_cvtepi32_ps(summed_pairs);
|
||||
@ -692,7 +692,7 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
|
||||
static inline __m128i packNibbles( __m256i bytes )
|
||||
{
|
||||
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
|
||||
#ifdef __AVX512F__
|
||||
#if __AVX512F__
|
||||
const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
|
||||
bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
|
||||
return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
|
||||
@ -4949,13 +4949,6 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
|
||||
return tensor;
|
||||
}
|
||||
|
||||
#ifdef __GNUC__
|
||||
#ifdef __MINGW32__
|
||||
__attribute__((gnu_format(printf, 2, 3)))
|
||||
#else
|
||||
__attribute__((format(printf, 2, 3)))
|
||||
#endif
|
||||
#endif
|
||||
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
|
||||
va_list args;
|
||||
va_start(args, fmt);
|
||||
@ -18728,14 +18721,6 @@ int ggml_cpu_has_sse3(void) {
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_ssse3(void) {
|
||||
#if defined(__SSSE3__)
|
||||
return 1;
|
||||
#else
|
||||
return 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
int ggml_cpu_has_vsx(void) {
|
||||
#if defined(__POWER9_VECTOR__)
|
||||
return 1;
|
||||
|
1
ggml.h
1
ggml.h
@ -1508,7 +1508,6 @@ extern "C" {
|
||||
GGML_API int ggml_cpu_has_clblast (void);
|
||||
GGML_API int ggml_cpu_has_gpublas (void);
|
||||
GGML_API int ggml_cpu_has_sse3 (void);
|
||||
GGML_API int ggml_cpu_has_ssse3 (void);
|
||||
GGML_API int ggml_cpu_has_vsx (void);
|
||||
|
||||
//
|
||||
|
@ -7,7 +7,6 @@ from torch import Tensor
|
||||
from torch import nn
|
||||
from typing import Dict
|
||||
from typing import Optional
|
||||
from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
|
||||
from coremltools.models.neural_network.quantization_utils import quantize_weights
|
||||
from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
|
||||
from whisper import load_model
|
||||
@ -32,12 +31,12 @@ def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
|
||||
state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight']
|
||||
return state_dict
|
||||
|
||||
class LayerNormANE(LayerNormANEBase):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._register_load_state_dict_pre_hook(
|
||||
correct_for_bias_scale_order_inversion)
|
||||
class LayerNorm(nn.LayerNorm):
|
||||
def forward(self, x: Tensor) -> Tensor:
|
||||
x = x.transpose(1,3)
|
||||
x = super().forward(x)
|
||||
x = x.transpose(1,3)
|
||||
return x
|
||||
|
||||
class MultiHeadAttentionANE(MultiHeadAttention):
|
||||
def __init__(self, n_state: int, n_head: int):
|
||||
@ -104,9 +103,9 @@ class ResidualAttentionBlockANE(ResidualAttentionBlock):
|
||||
def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
|
||||
super().__init__(n_state, n_head, cross_attention)
|
||||
self.attn = MultiHeadAttentionANE(n_state, n_head)
|
||||
self.attn_ln = LayerNormANE(n_state)
|
||||
self.attn_ln = LayerNorm(n_state)
|
||||
self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
|
||||
self.cross_attn_ln = LayerNormANE(n_state) if cross_attention else None
|
||||
self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
|
||||
|
||||
n_mlp = n_state * 4
|
||||
self.mlp = nn.Sequential(
|
||||
@ -114,7 +113,7 @@ class ResidualAttentionBlockANE(ResidualAttentionBlock):
|
||||
nn.GELU(),
|
||||
nn.Conv2d(n_mlp, n_state, kernel_size=1)
|
||||
)
|
||||
self.mlp_ln = LayerNormANE(n_state)
|
||||
self.mlp_ln = LayerNorm(n_state)
|
||||
|
||||
|
||||
class AudioEncoderANE(AudioEncoder):
|
||||
@ -124,7 +123,7 @@ class AudioEncoderANE(AudioEncoder):
|
||||
self.blocks = nn.ModuleList(
|
||||
[ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
|
||||
)
|
||||
self.ln_post = LayerNormANE(n_state)
|
||||
self.ln_post = LayerNorm(n_state)
|
||||
|
||||
def forward(self, x: Tensor):
|
||||
"""
|
||||
@ -168,7 +167,7 @@ class TextDecoderANE(TextDecoder):
|
||||
self.blocks= nn.ModuleList(
|
||||
[ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
|
||||
)
|
||||
self.ln= LayerNormANE(n_state)
|
||||
self.ln= LayerNorm(n_state)
|
||||
|
||||
def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
|
||||
"""
|
||||
|
@ -8,7 +8,7 @@
|
||||
wd=$(dirname "$0")
|
||||
cd "$wd/../"
|
||||
|
||||
python3 models/convert-whisper-to-coreml.py --model tiny.en
|
||||
python3 models/convert-whisper-to-coreml.py --model tiny.en --optimize-ane True
|
||||
|
||||
mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage
|
||||
xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/
|
||||
|
@ -13,7 +13,7 @@ mname="$1"
|
||||
wd=$(dirname "$0")
|
||||
cd "$wd/../"
|
||||
|
||||
python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True
|
||||
python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True --optimize-ane True
|
||||
|
||||
xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
|
||||
rm -rf models/ggml-${mname}-encoder.mlmodelc
|
||||
|
479
whisper.cpp
479
whisper.cpp
File diff suppressed because it is too large
Load Diff
@ -67,7 +67,6 @@ extern "C" {
|
||||
|
||||
struct whisper_context;
|
||||
struct whisper_state;
|
||||
struct whisper_full_params;
|
||||
|
||||
typedef int whisper_token;
|
||||
|
||||
@ -346,7 +345,7 @@ extern "C" {
|
||||
void * user_data);
|
||||
|
||||
// Parameters for the whisper_full() function
|
||||
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp:
|
||||
// If you chnage the order or add new parameters, make sure to update the default values in whisper.cpp:
|
||||
// whisper_full_default_params()
|
||||
struct whisper_full_params {
|
||||
enum whisper_sampling_strategy strategy;
|
||||
@ -375,7 +374,6 @@ extern "C" {
|
||||
// [EXPERIMENTAL] speed-up techniques
|
||||
// note: these can significantly reduce the quality of the output
|
||||
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
|
||||
bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
|
||||
int audio_ctx; // overwrite the audio context size (0 = use default)
|
||||
|
||||
// [EXPERIMENTAL] [TDRZ] tinydiarize
|
||||
@ -519,11 +517,6 @@ extern "C" {
|
||||
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
|
||||
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
|
||||
|
||||
// Control logging output; default behavior is to print to stderr
|
||||
|
||||
typedef void (*whisper_log_callback)(const char * line);
|
||||
WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
Reference in New Issue
Block a user