Compare commits


1 commit

SHA1        Message                                        Date
8cbc363561  coreml : attempt to fix ANE-optimized models   2023-07-11 23:03:53 +03:00

34 changed files with 463 additions and 26407 deletions

.github/workflows/build.yml

@@ -1,41 +1,31 @@
 name: CI
 on: [push, pull_request]

-env:
-  ubuntu_image: "ubuntu:22.04"
-
 jobs:
   ubuntu-latest:
     runs-on: ubuntu-latest

-    strategy:
-      fail-fast: false
-      matrix:
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]
-
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+          sudo apt-get install libsdl2-dev

-      - name: Build ${{ matrix.arch }}
+      - name: Build
         run: |
-          docker run --platform ${{ matrix.arch }} --rm \
-            -v ${{ github.workspace }}:/workspace \
-            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            apt update
-            apt install -y build-essential libsdl2-dev
-            make
-            make stream'
+          make
+          make stream

   macOS-latest:
     runs-on: macOS-latest

     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Dependencies
         run: |
@@ -47,104 +37,82 @@ jobs:
           make
           make stream

-  freeBSD-latest:
-    runs-on: macos-12
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: Build
-        uses: cross-platform-actions/action@v0.15.0
-        with:
-          operating_system: freebsd
-          version: '13.2'
-          run: |
-            sudo pkg update
-            sudo pkg install -y gmake sdl2
-            gmake
-            gmake stream
-
   ubuntu-latest-gcc:
     runs-on: ubuntu-latest

     strategy:
-      fail-fast: false
       matrix:
         build: [Debug, Release]
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]

     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+          sudo apt-get install cmake
+          sudo apt-get install libsdl2-dev

-      - name: Build ${{ matrix.arch }}
+      - name: Configure
+        run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
+
+      - name: Build
         run: |
-          docker run --platform ${{ matrix.arch }} --rm \
-            -v ${{ github.workspace }}:/workspace \
-            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            apt update
-            apt install -y build-essential cmake libsdl2-dev
-            cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }}
-            make
-            ctest -L gh --output-on-failure'
+          make
+          ctest -L gh --output-on-failure

   ubuntu-latest-clang:
     runs-on: ubuntu-latest

     strategy:
-      fail-fast: false
       matrix:
         build: [Debug, Release]
-        arch: [linux/amd64, linux/arm64, linux/arm/v7, linux/ppc64le]

     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+          sudo apt-get install cmake
+          sudo apt-get install libsdl2-dev

-      - name: Build ${{ matrix.arch }}
+      - name: Configure
+        run: cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
+
+      - name: Build
         run: |
-          docker run --platform ${{ matrix.arch }} --rm \
-            -v ${{ github.workspace }}:/workspace \
-            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            apt update
-            apt install -y build-essential cmake libsdl2-dev
-            cmake . -DWHISPER_SUPPORT_SDL2=ON -DCMAKE_BUILD_TYPE=${{ matrix.build }} -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_C_COMPILER=clang
-            make
-            ctest -L gh --output-on-failure'
+          make
+          ctest -L gh --output-on-failure

   ubuntu-latest-gcc-sanitized:
     runs-on: ubuntu-latest

     strategy:
-      fail-fast: false
       matrix:
         sanitizer: [ADDRESS, THREAD, UNDEFINED]
-        arch: [linux/amd64]

     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
+      - name: Dependencies
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+          sudo apt-get install cmake

-      - name: Build ${{ matrix.arch }}
+      - name: Configure
+        run: cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
+
+      - name: Build
         run: |
-          docker run --platform ${{ matrix.arch }} --rm \
-            -v ${{ github.workspace }}:/workspace \
-            -w /workspace ${{ env.ubuntu_image }} /bin/sh -c '
-            apt update
-            apt install -y build-essential cmake
-            cmake . -DCMAKE_BUILD_TYPE=Debug -DWHISPER_SANITIZE_${{ matrix.sanitizer }}=ON
-            make
-            ctest -L gh --output-on-failure'
+          make
+          ctest -L gh --output-on-failure

   windows:
     runs-on: windows-latest
@@ -166,7 +134,7 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Add msbuild to PATH
         uses: microsoft/setup-msbuild@v1
@@ -227,7 +195,7 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Add msbuild to PATH
         uses: microsoft/setup-msbuild@v1
@@ -293,7 +261,7 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Add msbuild to PATH
         uses: microsoft/setup-msbuild@v1
@@ -340,16 +308,24 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

-      - name: Setup emsdk
-        uses: mymindstorm/setup-emsdk@v12
+      - name: Dependencies
+        run: |
+          wget -q https://github.com/emscripten-core/emsdk/archive/master.tar.gz
+          tar -xvf master.tar.gz
+          emsdk-master/emsdk update
+          emsdk-master/emsdk install latest
+          emsdk-master/emsdk activate latest

-      - name: Verify
-        run: emcc -v
+      - name: Configure
+        run: echo "tmp"

       - name: Build
         run: |
+          pushd emsdk-master
+          source ./emsdk_env.sh
+          popd
           emcmake cmake . -DCMAKE_BUILD_TYPE=${{ matrix.build }}
           make
@@ -362,7 +338,7 @@ jobs:
     steps:
       - name: Clone
-        uses: actions/checkout@v3
+        uses: actions/checkout@v1

       - name: Configure
         run: |
@@ -380,7 +356,7 @@ jobs:
     steps:
      - name: Clone
-       uses: actions/checkout@v3
+       uses: actions/checkout@v1

     - name: Install Java
       uses: actions/setup-java@v3
@@ -400,7 +376,7 @@ jobs:
     needs: [ 'windows' ]
     runs-on: windows-latest
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v1
      - name: Install Java
        uses: actions/setup-java@v1
@@ -426,24 +402,11 @@ jobs:
           name: whispercpp.jar
           path: bindings/java/build/libs/whispercpp-*.jar

-      - name: Publish package
-        if: ${{ github.ref == 'refs/heads/master' }}
-        uses: gradle/gradle-build-action@v2
-        with:
-          arguments: publish
-        env:
-          MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
-          MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}
-
-  quantize:
-    runs-on: ubuntu-latest
-
-    steps:
-      - name: Clone
-        uses: actions/checkout@v3
-
-      - name: Test quantize
-        run: |
-          ./models/download-ggml-model.sh tiny.en
-          make quantize
-          ./quantize models/ggml-tiny.en.bin models/ggml-tiny.en-q4_0.bin q4_0
+#      - name: Publish package
+#        if: ${{ github.ref == 'refs/heads/master' }}
+#        uses: gradle/gradle-build-action@v2
+#        with:
+#          arguments: publish
+#        env:
+#          MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }}
+#          MAVEN_PASSWORD: ${{ secrets.OSSRH_TOKEN }}

.gitignore

@@ -24,7 +24,6 @@ build-sanitize-thread/
 /talk-llama
 /bench
 /quantize
-/lsp

 arm_neon.h
 sync.sh

CMakeLists.txt

@@ -65,7 +65,6 @@ else()
     option(WHISPER_BLAS_VENDOR "whisper: BLAS library vendor" Generic)
     option(WHISPER_OPENBLAS    "whisper: prefer OpenBLAS" OFF)
     option(WHISPER_CUBLAS      "whisper: support for cuBLAS" OFF)
-    option(WHISPER_HIPBLAS     "whisper: support for hipBLAS" OFF)
     option(WHISPER_CLBLAST     "whisper: use CLBlast" OFF)
 endif()
@@ -137,17 +136,6 @@ if (WHISPER_OPENBLAS)
 endif()

 if (WHISPER_BLAS)
-    if (WIN32)
-        if(DEFINED ENV{OPENBLAS_PATH})
-            set(BLAS_LIBRARIES $ENV{OPENBLAS_PATH}/lib/libopenblas.dll.a)
-            message(STATUS "Libraries ${BLAS_LIBRARIES}")
-            set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
-            include_directories($ENV{OPENBLAS_PATH}/include)
-            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
-        else ()
-            message(WARNING "BLAS library was not found. Environment variable OPENBLAS_PATH not defined.")
-        endif ()
-    else ()
     set(BLA_STATIC 1)
     set(BLA_VENDOR ${WHISPER_BLAS_VENDOR})
     # set(BLA_PREFER_PKGCONFIG 1)
@@ -157,15 +145,14 @@ if (WHISPER_BLAS)
     if(BLAS_FOUND)
         message(STATUS "BLAS compatible library found")
         message(STATUS "Libraries ${BLAS_LIBRARIES}")
-        find_path(BLAS_INCLUDE_DIRS cblas.h /usr/include/openblas /usr/local/include/openblas $ENV{BLAS_HOME}/include)
         set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DGGML_USE_OPENBLAS)
         include_directories(${BLAS_INCLUDE_DIRS})
         set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ${BLAS_LIBRARIES})
     else()
         message(WARNING "BLAS library was not found")
     endif()
     endif ()
-    endif ()

 if (WHISPER_CUBLAS)
     cmake_minimum_required(VERSION 3.17)
@@ -192,37 +179,6 @@ if (WHISPER_CUBLAS)
     endif()
 endif()

-if (WHISPER_HIPBLAS)
-    list(APPEND CMAKE_PREFIX_PATH /opt/rocm)
-
-    if (NOT ${CMAKE_C_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CC=/opt/rocm/llvm/bin/clang")
-    endif()
-    if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang")
-        message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++")
-    endif()
-
-    find_package(hip)
-    find_package(hipblas)
-    find_package(rocblas)
-
-    if (${hipblas_FOUND} AND ${hip_FOUND})
-        message(STATUS "HIP and hipBLAS found")
-        add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
-        add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
-        set_property(TARGET ggml-rocm PROPERTY POSITION_INDEPENDENT_CODE ON)
-        set_source_files_properties(ggml-cuda.cu PROPERTIES LANGUAGE CXX)
-        target_link_libraries(ggml-rocm PRIVATE hip::device PUBLIC hip::host roc::rocblas roc::hipblas)
-
-        if (WHISPER_STATIC)
-            message(FATAL_ERROR "Static linking not supported for HIP/ROCm")
-        endif()
-
-        set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} ggml-rocm)
-    else()
-        message(WARNING "hipBLAS or HIP not found. Try setting CMAKE_PREFIX_PATH=/opt/rocm")
-    endif()
-endif()
-
 if (WHISPER_CLBLAST)
     find_package(CLBlast)
     if (CLBlast_FOUND)
@@ -281,14 +237,9 @@ message(STATUS "CMAKE_SYSTEM_PROCESSOR: ${CMAKE_SYSTEM_PROCESSOR}")
 if (${CMAKE_SYSTEM_PROCESSOR} MATCHES "arm" OR ${CMAKE_SYSTEM_PROCESSOR} MATCHES "aarch64")
     message(STATUS "ARM detected")
-elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "ppc64le")
-    message(STATUS "PowerPC detected")
 else()
     message(STATUS "x86 detected")
     if (MSVC)
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /utf-8")
-        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /utf-8")
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /utf-8")
         if(NOT WHISPER_NO_AVX2)
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
             set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")

Makefile

@@ -12,12 +12,6 @@ ifndef UNAME_M
 UNAME_M := $(shell uname -m)
 endif

-ifndef NVCC_VERSION
-	ifeq ($(call,$(shell which nvcc))$(.SHELLSTATUS),0)
-		NVCC_VERSION := $(shell nvcc --version | egrep -o "V[0-9]+.[0-9]+.[0-9]+" | cut -c2-)
-	endif
-endif
-
 CCV := $(shell $(CC) --version | head -n 1)
 CXXV := $(shell $(CXX) --version | head -n 1)
@@ -57,7 +51,19 @@ endif
 # OS specific
 # TODO: support Windows
-ifeq ($(filter $(UNAME_S),Linux Darwin DragonFly FreeBSD NetBSD OpenBSD Haiku),$(UNAME_S))
+ifeq ($(UNAME_S),Linux)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Darwin)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),FreeBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Haiku)
 	CFLAGS   += -pthread
 	CXXFLAGS += -pthread
 endif
@@ -67,50 +73,60 @@ endif
 # feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	ifeq ($(UNAME_S),Darwin)
-		CPUINFO_CMD := sysctl machdep.cpu.features
-	else ifeq ($(UNAME_S),Linux)
-		CPUINFO_CMD := cat /proc/cpuinfo
-	else ifneq (,$(filter MINGW32_NT% MINGW64_NT%,$(UNAME_S)))
-		CPUINFO_CMD := cat /proc/cpuinfo
-	else ifeq ($(UNAME_S),Haiku)
-		CPUINFO_CMD := sysinfo -cpu
-	endif
-
-	ifdef CPUINFO_CMD
-		AVX_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx ")
-		ifneq (,$(findstring avx,$(AVX_M)))
+		CFLAGS += -mf16c
+		AVX1_M := $(shell sysctl machdep.cpu.features)
+		ifneq (,$(findstring FMA,$(AVX1_M)))
+			CFLAGS += -mfma
+		endif
+		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
 			CFLAGS += -mavx
 		endif
-
-		AVX2_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx2 ")
+		AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
+		ifneq (,$(findstring AVX2,$(AVX2_M)))
+			CFLAGS += -mavx2
+		endif
+	else ifeq ($(UNAME_S),Linux)
+		AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
 		ifneq (,$(findstring avx2,$(AVX2_M)))
 			CFLAGS += -mavx2
 		endif
-
-		FMA_M := $(shell $(CPUINFO_CMD) | grep -m 1 "fma ")
+		FMA_M := $(shell grep "fma " /proc/cpuinfo)
 		ifneq (,$(findstring fma,$(FMA_M)))
 			CFLAGS += -mfma
 		endif
-
-		F16C_M := $(shell $(CPUINFO_CMD) | grep -m 1 "f16c ")
+		F16C_M := $(shell grep "f16c " /proc/cpuinfo)
 		ifneq (,$(findstring f16c,$(F16C_M)))
 			CFLAGS += -mf16c
-
-			AVX1_M := $(shell $(CPUINFO_CMD) | grep -m 1 "avx ")
+			AVX1_M := $(shell grep "avx " /proc/cpuinfo)
 			ifneq (,$(findstring avx,$(AVX1_M)))
 				CFLAGS += -mavx
 			endif
 		endif
-
-		SSE3_M := $(shell $(CPUINFO_CMD) | grep -m 1 "sse3 ")
+		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
 		ifneq (,$(findstring sse3,$(SSE3_M)))
 			CFLAGS += -msse3
 		endif
-
-		SSSE3_M := $(shell $(CPUINFO_CMD) | grep -m 1 "ssse3 ")
-		ifneq (,$(findstring ssse3,$(SSSE3_M)))
-			CFLAGS += -mssse3
+	else ifeq ($(UNAME_S),Haiku)
+		AVX2_M := $(shell sysinfo -cpu | grep "AVX2 ")
+		ifneq (,$(findstring avx2,$(AVX2_M)))
+			CFLAGS += -mavx2
 		endif
+		FMA_M := $(shell sysinfo -cpu | grep "FMA ")
+		ifneq (,$(findstring fma,$(FMA_M)))
+			CFLAGS += -mfma
+		endif
+		F16C_M := $(shell sysinfo -cpu | grep "F16C ")
+		ifneq (,$(findstring f16c,$(F16C_M)))
+			CFLAGS += -mf16c
+			AVX1_M := $(shell sysinfo -cpu | grep "AVX ")
+			ifneq (,$(findstring avx,$(AVX1_M)))
+				CFLAGS += -mavx
+			endif
+		endif
+	else
+		CFLAGS += -mfma -mf16c -mavx -mavx2
 	endif
 endif

 ifeq ($(UNAME_M),amd64)
@@ -146,56 +162,29 @@ endif
 endif

 ifdef WHISPER_OPENBLAS
-	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas -I/usr/include/openblas
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
 	LDFLAGS += -lopenblas
 endif

 ifdef WHISPER_CUBLAS
-	ifeq ($(shell expr $(NVCC_VERSION) \>= 11.6), 1)
-		CUDA_ARCH_FLAG=native
-	else
-		CUDA_ARCH_FLAG=all
-	endif
-
 	CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
 	LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib
 	WHISPER_OBJ += ggml-cuda.o
 	NVCC        = nvcc
-	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=$(CUDA_ARCH_FLAG)
+	NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=any

 ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
 	$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
 endif

-ifdef WHISPER_HIPBLAS
-	ROCM_PATH   ?= /opt/rocm
-	HIPCC       ?= $(ROCM_PATH)/bin/hipcc
-	GPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
-	CFLAGS      += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-	CXXFLAGS    += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS
-	LDFLAGS     += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
-	LDFLAGS     += -lhipblas -lamdhip64 -lrocblas
-	HIPFLAGS    += $(addprefix --offload-arch=,$(GPU_TARGETS))
-	WHISPER_OBJ += ggml-cuda.o
-
-ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
-	$(HIPCC) $(CXXFLAGS) $(HIPFLAGS) -x hip -c -o $@ $<
-endif
-
 ifdef WHISPER_CLBLAST
-	CFLAGS   += -DGGML_USE_CLBLAST
-	CXXFLAGS += -DGGML_USE_CLBLAST
-	LDFLAGS  += -lclblast
-	ifeq ($(UNAME_S),Darwin)
-		LDFLAGS += -framework OpenCL
-	else
-		LDFLAGS += -lOpenCL
-	endif
+	CFLAGS  += -DGGML_USE_CLBLAST
+	LDFLAGS += -lclblast -lOpenCL
 	WHISPER_OBJ += ggml-opencl.o

 ggml-opencl.o: ggml-opencl.cpp ggml-opencl.h
-	$(CXX) $(CXXFLAGS) -c $< -o $@
+	$(CC) $(CFLAGS) -c $< -o $@
 endif

 ifdef WHISPER_GPROF
@@ -273,7 +262,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)

 clean:
-	rm -f *.o main stream command talk talk-llama bench quantize lsp libwhisper.a libwhisper.so
+	rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so

 #
 # Examples
@@ -300,9 +289,6 @@ stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHIS
 command: examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/command/command.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o command $(CC_SDL) $(LDFLAGS)

-lsp: examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
-	$(CXX) $(CXXFLAGS) examples/lsp/lsp.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o lsp $(CC_SDL) $(LDFLAGS)
-
 talk: examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
 	$(CXX) $(CXXFLAGS) examples/talk/talk.cpp examples/talk/gpt-2.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o talk $(CC_SDL) $(LDFLAGS)
@@ -323,7 +309,6 @@ samples:
 	@wget --quiet --show-progress -O samples/hp0.ogg https://upload.wikimedia.org/wikipedia/en/d/d4/En.henryfphillips.ogg
 	@wget --quiet --show-progress -O samples/mm1.wav https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav
 	@wget --quiet --show-progress -O samples/a13.mp3 https://upload.wikimedia.org/wikipedia/commons/transcoded/6/6f/Apollo13-wehaveaproblem.ogg/Apollo13-wehaveaproblem.ogg.mp3
-	@wget --quiet --show-progress -O samples/diffusion2023-07-03.flac https://archive.org/download/diffusion2023-07-03/diffusion2023-07-03.flac
 	@echo "Converting to 16-bit WAV ..."
 	@ffmpeg -loglevel -0 -y -i samples/gb0.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb0.wav
 	@ffmpeg -loglevel -0 -y -i samples/gb1.ogg -ar 16000 -ac 1 -c:a pcm_s16le samples/gb1.wav
@@ -333,8 +318,6 @@ samples:
 	@rm samples/mm1.wav
 	@ffmpeg -loglevel -0 -y -i samples/a13.mp3 -ar 16000 -ac 1 -c:a pcm_s16le -ss 00:00:00 -to 00:00:30 samples/a13.wav
 	@rm samples/a13.mp3
-	@ffmpeg -loglevel -0 -y -i samples/diffusion2023-07-03.flac -ar 16000 -ac 1 -c:a pcm_s16le samples/diffusion2023-07-03.wav
-	@rm samples/diffusion2023-07-03.flac

 #
 # Models
@@ -376,4 +359,4 @@ tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
 .PHONY: tests
 tests:
-	bash ./tests/run-tests.sh $(word 2, $(MAKECMDGOALS))
+	bash ./tests/run-tests.sh

README.md

@@ -22,7 +22,6 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
 - [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
 - [Partial OpenCL GPU support via CLBlast](https://github.com/ggerganov/whisper.cpp#opencl-gpu-support-via-clblast)
 - [BLAS CPU support via OpenBLAS](https://github.com/ggerganov/whisper.cpp#blas-cpu-support-via-openblas)
-- [OpenVINO Support](https://github.com/ggerganov/whisper.cpp#openvino-support)
 - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)

 Supported platforms:
@@ -61,7 +60,7 @@ Or you can even run it straight in the browser: [talk.wasm](examples/talk.wasm)
 - Various other examples are available in the [examples](examples) folder

 The tensor operators are optimized heavily for Apple silicon CPUs. Depending on the computation size, Arm Neon SIMD
-intrinsics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
+instrisics or CBLAS Accelerate framework routines are used. The latter are especially effective for bigger sizes since
 the Accelerate framework utilizes the special-purpose AMX coprocessor available in modern Apple products.

 ## Quick start
@@ -312,85 +311,6 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
 For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).

-## OpenVINO support
-
-On platforms that support [OpenVINO](https://github.com/openvinotoolkit/openvino), the Encoder inference can be executed
-on OpenVINO-supported devices including x86 CPUs and Intel GPUs (integrated & discrete).
-
-This can result in significant speedup in encoder performance. Here are the instructions for generating the OpenVINO model and using it with `whisper.cpp`:
-
-- First, setup python virtual env. and install python dependencies. Python 3.10 is recommended.
-
-  Windows:
-  ```
-  cd models
-  python -m venv openvino_conv_env
-  openvino_conv_env\Scripts\activate
-  python -m pip install --upgrade pip
-  pip install -r openvino-conversion-requirements.txt
-  ```
-
-  Linux and macOS:
-  ```
-  cd models
-  python3 -m venv openvino_conv_env
-  source openvino_conv_env/bin/activate
-  python -m pip install --upgrade pip
-  pip install -r openvino-conversion-requirements.txt
-  ```
-
-- Generate an OpenVINO encoder model. For example, to generate a `base.en` model, use:
-
-  ```
-  python convert-whisper-to-openvino.py --model base.en
-  ```
-
-  This will produce ggml-base.en-encoder-openvino.xml/.bin IR model files. It's recommended to relocate these to the same folder as ggml models, as that
-  is the default location that the OpenVINO extension will search at runtime.
-
-- Build `whisper.cpp` with OpenVINO support:
-
-  Download OpenVINO package from [release page](https://github.com/openvinotoolkit/openvino/releases). The recommended version to use is [2023.0.0](https://github.com/openvinotoolkit/openvino/releases/tag/2023.0.0).
-
-  After downloading & extracting package onto your development system, set up required environment by sourcing setupvars script. For example:
-
-  Linux:
-  ```bash
-  source /path/to/l_openvino_toolkit_ubuntu22_2023.0.0.10926.b4452d56304_x86_64/setupvars.sh
-  ```
-
-  Windows (cmd):
-  ```
-  C:\Path\To\w_openvino_toolkit_windows_2023.0.0.10926.b4452d56304_x86_64\setupvars.bat
-  ```
-
-  And then build the project using cmake:
-  ```bash
-  cd build
-  cmake -DWHISPER_OPENVINO=1 ..
-  ```
-
-- Run the examples as usual. For example:
-  ```bash
-  ./main -m models/ggml-base.en.bin -f samples/jfk.wav
-
-  ...
-
-  whisper_ctx_init_openvino_encoder: loading OpenVINO model from 'models/ggml-base.en-encoder-openvino.xml'
-  whisper_ctx_init_openvino_encoder: first run on a device may take a while ...
-  whisper_openvino_init: path_model = models/ggml-base.en-encoder-openvino.xml, device = GPU, cache_dir = models/ggml-base.en-encoder-openvino-cache
-  whisper_ctx_init_openvino_encoder: OpenVINO model loaded
-
-  system_info: n_threads = 4 / 8 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | VSX = 0 | COREML = 0 | OPENVINO = 1 |
-
-  ...
-  ```
-
-  The first time run on an OpenVINO device is slow, since the OpenVINO framework will compile the IR (Intermediate Representation) model to a device-specific 'blob'. This device-specific blob will get
-  cached for the next run.
-
-For more information about the Core ML implementation please refer to PR [#1037](https://github.com/ggerganov/whisper.cpp/pull/1037).

 ## NVIDIA GPU support via cuBLAS

 With NVIDIA cards the Encoder processing can to a large extent be offloaded to the GPU through cuBLAS.

bindings/go/params.go

@@ -19,10 +19,6 @@ func (p *Params) SetTranslate(v bool) {
 	p.translate = toBool(v)
 }

-func (p *Params) SetSplitOnWord(v bool) {
-	p.split_on_word = toBool(v)
-}
-
 func (p *Params) SetNoContext(v bool) {
 	p.no_context = toBool(v)
 }

bindings/go/pkg/whisper/context.go

@@ -81,10 +81,6 @@ func (context *context) SetSpeedup(v bool) {
 	context.params.SetSpeedup(v)
 }

-func (context *context) SetSplitOnWord(v bool) {
-	context.params.SetSplitOnWord(v)
-}
-
 // Set number of threads to use
 func (context *context) SetThreads(v uint) {
 	context.params.SetThreads(int(v))

bindings/go/pkg/whisper/interface.go

@@ -42,7 +42,6 @@ type Context interface {
 	SetDuration(time.Duration)    // Set duration
 	SetThreads(uint)              // Set number of threads to use
 	SetSpeedup(bool)              // Set speedup flag
-	SetSplitOnWord(bool)          // Set split on word flag
 	SetTokenThreshold(float32)    // Set timestamp token probability threshold
 	SetTokenSumThreshold(float32) // Set timestamp token sum probability threshold
 	SetMaxSegmentLength(uint)     // Set max segment length in characters

coreml/whisper-decoder-impl.h

@@ -31,10 +31,10 @@ API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((v
 API_AVAILABLE(macos(12.0), ios(15.0), watchos(8.0), tvos(15.0)) __attribute__((visibility("hidden")))
 @interface whisper_decoder_implOutput : NSObject<MLFeatureProvider>

-/// var_1346 as multidimensional array of floats
-@property (readwrite, nonatomic, strong) MLMultiArray * var_1346;
+/// var_1195 as multidimensional array of floats
+@property (readwrite, nonatomic, strong) MLMultiArray * var_1195;
 - (instancetype)init NS_UNAVAILABLE;
-- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 NS_DESIGNATED_INITIALIZER;
+- (instancetype)initWithVar_1195:(MLMultiArray *)var_1195 NS_DESIGNATED_INITIALIZER;

 @end

coreml/whisper-decoder-impl.m

@@ -39,21 +39,21 @@
 @implementation whisper_decoder_implOutput

-- (instancetype)initWithVar_1346:(MLMultiArray *)var_1346 {
+- (instancetype)initWithVar_1195:(MLMultiArray *)var_1195 {
     self = [super init];
     if (self) {
-        _var_1346 = var_1346;
+        _var_1195 = var_1195;
     }
     return self;
 }

 - (NSSet<NSString *> *)featureNames {
-    return [NSSet setWithArray:@[@"var_1346"]];
+    return [NSSet setWithArray:@[@"var_1195"]];
 }

 - (nullable MLFeatureValue *)featureValueForName:(NSString *)featureName {
-    if ([featureName isEqualToString:@"var_1346"]) {
-        return [MLFeatureValue featureValueWithMultiArray:self.var_1346];
+    if ([featureName isEqualToString:@"var_1195"]) {
+        return [MLFeatureValue featureValueWithMultiArray:self.var_1195];
     }
     return nil;
 }
@@ -177,7 +177,7 @@
 - (nullable whisper_decoder_implOutput *)predictionFromFeatures:(whisper_decoder_implInput *)input options:(MLPredictionOptions *)options error:(NSError * _Nullable __autoreleasing * _Nullable)error {
     id<MLFeatureProvider> outFeatures = [self.model predictionFromFeatures:input options:options error:error];
     if (!outFeatures) { return nil; }
-    return [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[outFeatures featureValueForName:@"var_1346"].multiArrayValue];
+    return [[whisper_decoder_implOutput alloc] initWithVar_1195:(MLMultiArray *)[outFeatures featureValueForName:@"var_1195"].multiArrayValue];
 }

 - (nullable whisper_decoder_implOutput *)predictionFromToken_data:(MLMultiArray *)token_data audio_data:(MLMultiArray *)audio_data error:(NSError * _Nullable __autoreleasing * _Nullable)error {
@@ -192,7 +192,7 @@
     NSMutableArray<whisper_decoder_implOutput*> *results = [NSMutableArray arrayWithCapacity:(NSUInteger)outBatch.count];
     for (NSInteger i = 0; i < outBatch.count; i++) {
         id<MLFeatureProvider> resultProvider = [outBatch featuresAtIndex:i];
-        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1346:(MLMultiArray *)[resultProvider featureValueForName:@"var_1346"].multiArrayValue];
+        whisper_decoder_implOutput * result = [[whisper_decoder_implOutput alloc] initWithVar_1195:(MLMultiArray *)[resultProvider featureValueForName:@"var_1195"].multiArrayValue];
         [results addObject:result];
     }
     return results;

coreml/whisper-encoder.mm

@@ -53,12 +53,10 @@ void whisper_coreml_encode(
                     error: nil
     ];

-    @autoreleasepool {
-        whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];
+    whisper_encoder_implOutput * outCoreML = [(__bridge id) ctx->data predictionFromLogmel_data:inMultiArray error:nil];

-        memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
-    }
+    memcpy(out, outCoreML.output.dataPointer, outCoreML.output.count * sizeof(float));
 }

 #if __cplusplus
 }

examples/CMakeLists.txt

@@ -69,5 +69,4 @@ else()
     add_subdirectory(quantize)
     add_subdirectory(talk)
     add_subdirectory(talk-llama)
-    add_subdirectory(lsp)
 endif()

examples/lsp/CMakeLists.txt

@@ -1,9 +0,0 @@
if (WHISPER_SDL2)
    # stream
    set(TARGET lsp)
    add_executable(${TARGET} lsp.cpp)

    include(DefaultTargetOptions)

    target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
endif ()

examples/lsp/README.md

@@ -1,104 +0,0 @@
# Language Server
This example consists of a simple language server that exposes both unguided
and guided (command) transcription by sending JSON messages over stdout/stdin,
as well as a rather robust vim plugin that makes use of the language server.
## Vim plugin quick start
Compile the language server with
```bash
make lsp
```
Install the plugin itself by copying or symlinking whisper.vim into ~/.vim/autoload/
In your vimrc, set the path of your whisper.cpp directory and optionally add some keybinds.
```vim
let g:whisper_dir = "~/whisper.cpp"
" Start listening for commands when Ctrl - g is pressed in normal mode
nnoremap <C-G> call whisper#requestCommands()<CR>
" Start unguided transcription when Ctrl - g is pressed in insert mode
inoremap <C-G> <Cmd>call whisper#doTranscription()<CR>
```
## Vim plugin usage
The vim plugin was designed to closely follow the mnemonics of vim
`s:spoken_dict` is used to translate keys to their spoken form.
Keys corresponding to a string use that spoken value normally and when a motion is expected, but use the key itself when a character is expected.
Keys corresponding to a dict, like `i`, can have manual definitions given for each possible commandset.
0 is normal (insert), 1 is motion (inside), 2 is its usage as a single key ([till] i), and 3 is its usage in an area selection (s -> [around] sentence).
Some punctuation items, like `-`, are explicitly given pronunciations to prevent them from being picked up as punctuation instead of an actual command word.
Not all commands tokenize to a single token, and this can interfere with interpretation. "yank", for example, takes multiple tokens and correspondingly gives more accurate detection when only the first "ya" is used. While it could be changed to something else that is a single token (copy), value was placed on maintaining vim mnemonics.
Commands that would normally move the editor into insert mode (insert, append, open, change) will begin unguided transcription.
Unguided transcription will end when a speech segment ends in exit.
Presence of punctuation can be designated by whether or not you add a pause between the previous speech segment and exit.
Exiting only occurs if exit is the last word, so "Take the first exit on your right" would not cause transcription to end.
After a command is evaluated, the plugin will continue listening for the next command.
While in command mode, "Exit" will end listening.
A best-effort approach is taken to keep track of audio recorded while a previous chunk is still processing, and to interpret it immediately afterwards; the current voice detection still needs a fairly sizable gap to determine when a command has been spoken.
Log information is sent to a special `whisper_log` buffer and can be accessed with
```vim
:e whisper_log
```
## Vim plugin configuration
`g:whisper_dir`
A full path to the whisper.cpp repo. It can be expanded in the definition like so:
```vim
let g:whisper_dir = expand("~/whisper.cpp/")
```
(The WHISPER_CPP_HOME environment variable is also checked for users of the existing whisper.nvim script)
`g:whisper_lsp_path`
Can be used to manually set the path to the language server.
If not defined, it will be inferred from the above whisper_dir
`g:whisper_model_path`
A full path to the model to load. If not defined, it will default to ggml-base.en.bin
`g:whisper_user_commands`
A dictionary of spoken commands that correspond to either strings or funcrefs.
This can be used to create connections with other user plugins, for example
```vim
let g:whisper_user_commands = {"gen": "llama#doLlamaGen"}
```
will trigger the llama.cpp plugin to begin generation when "gen" is spoken
## Language server methods
`registerCommandset`
`params` is a list of strings that should be checked for with this commandset. The server prepends a space to these strings before tokenizing.
Responds with
`result.index` an integer index for the commandset registered, which should be included when initiating a guided transcription to select this commandset.
Will return an error if any of the commands in the commandset have duplicate tokenizations
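
As a concrete sketch of the framing (the id, words, and resulting index here are illustrative, not normative), a `registerCommandset` exchange looks like:

```
Content-Length: 89

{"jsonrpc": "2.0", "id": 1, "method": "registerCommandset", "params": ["yank", "delete"]}
```

with a reply of the form `{"jsonrpc": "2.0", "id": 1, "result": {"index": 0}}`. Every message in both directions is length-prefixed this way, matching how the server reads stdin and writes stdout in `process_loop` below.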
`guided`
`params.commandset_index` An index returned by a corresponding commandset registration. If not set, the most recently registered commandset is used.
`params.timestamp` A positive unsigned integer which designates a point in time from which audio should begin processing. If left blank, the start point of audio processing will be the moment the message is received. This should be left blank unless you have a timestamp from a previous response.
Responds with
`result.command_index` The numerical index (starting from 0) of the detected command in the selected commandset
`result.command_text` A string containing the command as provided in the commandset
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
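
For example, a guided request against the commandset registered above might be (illustrative values again):

```
Content-Length: 82

{"jsonrpc": "2.0", "id": 2, "method": "guided", "params": {"commandset_index": 0}}
```

answered with something like `{"jsonrpc": "2.0", "id": 2, "result": {"command_index": 1, "command_text": "delete", "timestamp": 1688000000000}}`.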
`unguided`
`params.no_context` Sets the corresponding whisper `no_context` param. Defaults to true. Might provide more accurate results for consecutive unguided transcriptions if those after the first are set to false.
`params.prompt` If provided, sets the initial prompt used during transcription.
`params.timestamp` A positive unsigned integer which designates a point in time from which audio should begin processing. If left blank, the start point of audio processing will be the moment the message is received. This should be left blank unless you have a timestamp from a previous response.
Responds with
`result.transcription` A string containing the transcribed text. N.B. This will almost always start with a space due to how text is tokenized.
`result.timestamp` A positive unsigned integer that designates the point in time which audio stopped being processed at. Pass this timestamp back in a subsequent message to mask the latency of transcription.
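
An unguided exchange follows the same shape (values illustrative):

```
Content-Length: 81

{"jsonrpc": "2.0", "id": 3, "method": "unguided", "params": {"no_context": true}}
```

returning something like `{"jsonrpc": "2.0", "id": 3, "result": {"transcription": " hello world", "timestamp": 1688000000000}}`.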

File diff suppressed because it is too large.

examples/lsp/lsp.cpp

@@ -1,458 +0,0 @@
#include "common.h"
#include "common-sdl.h"
#include "whisper.h"
#include "json.hpp"
#include <iostream>
#include <cassert>
#include <cstdio>
#include <string>
#include <thread>
#include <vector>
#include <deque>
#include <set>
using json = nlohmann::json;
// command-line parameters
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t prompt_ms = 5000;
int32_t command_ms = 8000;
int32_t capture_id = -1;
int32_t max_tokens = 32;
int32_t audio_ctx = 0;
float vad_thold = 0.6f;
float freq_thold = 100.0f;
bool speed_up = false;
bool translate = false;
bool print_special = false;
bool print_energy = false;
std::string language = "en";
std::string model = "models/ggml-base.en.bin";
};
struct command {
std::vector<whisper_token> tokens;
std::string plaintext;
};
struct commandset {
std::vector<struct command> commands;
std::vector<whisper_token> prompt_tokens;
// TODO: Store longest command?
// Multi-token commands should have probabilities of subsequent logits
// given that the prior logit is correct.
// In this case, all commands must be iterated.
// This however, is likely highly involved as different tokens
// almost certainly have different spoken lengths
// It would also have performance implications equivalent to a beam search
};
void whisper_print_usage(int argc, char ** argv, const whisper_params & params);
bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-h" || arg == "--help") {
whisper_print_usage(argc, argv, params);
exit(0);
}
else if (arg == "-t" || arg == "--threads") { params.n_threads = std::stoi(argv[++i]); }
else if (arg == "-pms" || arg == "--prompt-ms") { params.prompt_ms = std::stoi(argv[++i]); }
else if (arg == "-cms" || arg == "--command-ms") { params.command_ms = std::stoi(argv[++i]); }
else if (arg == "-c" || arg == "--capture") { params.capture_id = std::stoi(argv[++i]); }
else if (arg == "-mt" || arg == "--max-tokens") { params.max_tokens = std::stoi(argv[++i]); }
else if (arg == "-ac" || arg == "--audio-ctx") { params.audio_ctx = std::stoi(argv[++i]); }
else if (arg == "-vth" || arg == "--vad-thold") { params.vad_thold = std::stof(argv[++i]); }
else if (arg == "-fth" || arg == "--freq-thold") { params.freq_thold = std::stof(argv[++i]); }
else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pe" || arg == "--print-energy") { params.print_energy = true; }
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params) {
fprintf(stderr, "\n");
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help [default] show this help message and exit\n");
fprintf(stderr, " -t N, --threads N [%-7d] number of threads to use during computation\n", params.n_threads);
fprintf(stderr, " -pms N, --prompt-ms N [%-7d] prompt duration in milliseconds\n", params.prompt_ms);
fprintf(stderr, " -cms N, --command-ms N [%-7d] command duration in milliseconds\n", params.command_ms);
fprintf(stderr, " -c ID, --capture ID [%-7d] capture device ID\n", params.capture_id);
fprintf(stderr, " -mt N, --max-tokens N [%-7d] maximum number of tokens per audio chunk\n", params.max_tokens);
fprintf(stderr, " -ac N, --audio-ctx N [%-7d] audio context size (0 - all)\n", params.audio_ctx);
fprintf(stderr, " -vth N, --vad-thold N [%-7.2f] voice activity detection threshold\n", params.vad_thold);
fprintf(stderr, " -fth N, --freq-thold N [%-7.2f] high-pass frequency cutoff\n", params.freq_thold);
fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pe, --print-energy [%-7s] print sound energy (for debugging)\n", params.print_energy ? "true" : "false");
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, "\n");
}
uint64_t wait_for_vad(audio_async & audio, json jparams, const whisper_params & params, uint64_t maxlength_ms, std::vector<float> & pcmf32) {
using namespace std::chrono;
uint64_t time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
uint64_t start_time = time_now;
if (jparams.contains("timestamp")) {
start_time = jparams.at("timestamp");
}
if(time_now - start_time < 500) {
//wait for a backlog of audio
std::this_thread::sleep_for(milliseconds(500 - (time_now - start_time)));
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
} else if (time_now - start_time > 1000) {
audio.get(time_now-start_time, pcmf32);
size_t max_offset = pcmf32.size() - WHISPER_SAMPLE_RATE;
for(size_t offset=0;offset < max_offset;offset+=WHISPER_SAMPLE_RATE/10) {
std::vector<float> audio_chunk(&pcmf32[offset], &pcmf32[offset+WHISPER_SAMPLE_RATE]);
if(::vad_simple(audio_chunk, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
pcmf32.resize(offset+WHISPER_SAMPLE_RATE);
if (offset*1000/WHISPER_SAMPLE_RATE+1000 > maxlength_ms) {
//remove samples from the beginning
pcmf32.erase(pcmf32.begin(),pcmf32.end()-(maxlength_ms*WHISPER_SAMPLE_RATE/1000));
fprintf(stderr, "Shortened samples");
}
return start_time + offset*1000/WHISPER_SAMPLE_RATE+1000;
}
}
}
size_t window_duration = std::max((uint64_t)1000, time_now-start_time);
audio.get(window_duration, pcmf32);
while (!::vad_simple(pcmf32, WHISPER_SAMPLE_RATE, 1000, params.vad_thold, params.freq_thold, params.print_energy)) {
std::this_thread::sleep_for(milliseconds(100));
time_now = time_point_cast<milliseconds>(system_clock::now()).time_since_epoch().count();
window_duration = std::max((uint64_t)1000,time_now-start_time);
audio.get(window_duration, pcmf32);
}
if (time_now - start_time > maxlength_ms) {
audio.get(maxlength_ms, pcmf32);
} else {
audio.get(time_now - start_time, pcmf32);
}
return time_now;
}
json unguided_transcription(struct whisper_context * ctx, audio_async &audio, json jparams, const whisper_params &params) {
std::vector<whisper_token> prompt_tokens;
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 10000U, pcmf32);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
if (jparams.contains("prompt")) {
// unlikely to see much use. Under normal circumstances, no_context would be set to false
std::string prompt = jparams.at("prompt");
prompt_tokens.resize(1024);
int n = whisper_tokenize(ctx, prompt.c_str(), prompt_tokens.data(), 1024);
prompt_tokens.resize(n);
wparams.prompt_tokens = prompt_tokens.data();
wparams.prompt_n_tokens = prompt_tokens.size();
}
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = params.translate;
wparams.no_context = jparams.value("no_context", true);
wparams.single_segment = true;
wparams.max_tokens = params.max_tokens;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
throw json{
{"code", -32803},
{"message", "ERROR: whisper_full() failed"}
};
}
std::string result = whisper_full_get_segment_text(ctx,0);
return json {
{"transcription", result},
{"timestamp", unprocessed_audio_timestamp}
};
}
// command-list mode
// guide the transcription to match the most likely command from a provided list
json guided_transcription(struct whisper_context * ctx, audio_async &audio, const whisper_params &params, json jparams, std::vector<struct commandset> commandset_list) {
struct commandset cs = commandset_list[jparams.value("commandset_index", commandset_list.size()-1)];
std::vector<float> pcmf32;
uint64_t unprocessed_audio_timestamp = wait_for_vad(audio, jparams, params, 2000U, pcmf32);
fprintf(stderr, "%s: Speech detected! Processing ...\n", __func__);
whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
wparams.print_progress = false;
wparams.print_special = params.print_special;
wparams.print_realtime = false;
wparams.print_timestamps = false;
wparams.translate = params.translate;
wparams.no_context = true;
wparams.single_segment = true;
wparams.max_tokens = 1;
wparams.language = params.language.c_str();
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
// TODO: Do some time testing. Does an overly long prompt slow down processing?
// Set up command sets/precompute prompts
wparams.prompt_tokens = cs.prompt_tokens.data();
wparams.prompt_n_tokens = cs.prompt_tokens.size();
// TODO: properly expose as option
wparams.suppress_non_speech_tokens = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
throw json{
{"code", -32803},
{"message", "ERROR: whisper_full() failed"}//TODO: format string (sprintf?)
};
}
// estimate command probability
// NOTE: not optimal
{
const auto * logits = whisper_get_logits(ctx);
std::vector<float> probs(whisper_n_vocab(ctx), 0.0f);
// compute probs from logits via softmax
{
float max = -1e9;
for (int i = 0; i < (int) probs.size(); ++i) {
max = std::max(max, logits[i]);
}
float sum = 0.0f;
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] = expf(logits[i] - max);
sum += probs[i];
}
for (int i = 0; i < (int) probs.size(); ++i) {
probs[i] /= sum;
}
}
std::vector<std::pair<float, int>> probs_id;
// In my testing, the most verbose token is always the desired.
// TODO: Trim commandset struct once efficacy has been verified
for (int i = 0; i < (int) cs.commands.size(); ++i) {
probs_id.emplace_back(probs[cs.commands[i].tokens[0]], i);
}
// sort descending
{
using pair_type = decltype(probs_id)::value_type;
std::sort(probs_id.begin(), probs_id.end(), [](const pair_type & a, const pair_type & b) {
return a.first > b.first;
});
}
int id = probs_id[0].second;
return json{
{"command_index", id},
{"command_text", cs.commands[id].plaintext},
{"timestamp", unprocessed_audio_timestamp},
};
}
}
json register_commandset(struct whisper_context * ctx, json jparams, std::vector<struct commandset> &commandset_list) {
// TODO: check for token collision
struct commandset cs;
std::string k_prompt = " select one from the available words: ";
std::set<whisper_token> token_set;
whisper_token tokens[32];
for (std::string s : jparams) {
std::vector<whisper_token> token_vec;
// The existing command implementation uses a nested for loop to tokenize single characters
// I fail to see the purpose of this when ' a' has a wholly different pronunciation than the start of ' apple'
const int n = whisper_tokenize(ctx, (" " + s).c_str(), tokens, 32);
if (n < 0) {
fprintf(stderr, "%s: error: failed to tokenize command '%s'\n", __func__, s.c_str());
return 3;
}
token_vec.push_back(tokens[0]);
if (!token_set.insert(tokens[0]).second) {
fprintf(stderr, "%s: warning: %s is a duplicate of an existing token\n", __func__, s.c_str());
throw json{
{"code",-31000},
{"message", "Duplicate token in token set: " + s}
};
}
if (n > 1) {// empty string if n=0? Should never occur
fprintf(stderr, "%s: error: command is more than a single token: %s\n", __func__, s.c_str());
}
struct command command = {token_vec, s};
cs.commands.push_back(command);
k_prompt += s;
}
k_prompt = k_prompt.substr(0,k_prompt.length()-2) + ". Selected word:";
cs.prompt_tokens.resize(1024);
int n = whisper_tokenize(ctx, k_prompt.c_str(), cs.prompt_tokens.data(), 1024);
cs.prompt_tokens.resize(n);
// prepare response
int index = commandset_list.size();
commandset_list.push_back(cs);
return json{{"index",index}};
}
json seek(struct whisper_context * ctx, audio_async &audio, json params) {
// whisper_state has the pertinent offsets, but there also seem to be a large
// number of scratch buffers that would prevent rewinding context in a manner similar to llama
// I'll give this a another pass once everything else is implemented,
// but for now, it's unsupported
throw json{
{"code", -32601},
{"message", "Seeking is not yet supported."}
};
}
json parse_job(const json &body, struct whisper_context * ctx, audio_async &audio, const whisper_params &params, std::vector<struct commandset> &commandset_list) {
// See: https://www.jsonrpc.org/specification
json id = body.at("id");
try {
std::string version = body.at("jsonrpc");
if (version != "2.0") {
// unsupported version
throw json{
{"code", -3260},
{"message", "invalid jsonrpc version"}
};
}
std::string method = body.at("method");
json jparams = json{{"dummy", "dummy"}};
if (body.contains("params"))
jparams = body.at("params");
json res;
// TODO: be consistent about argument order
fprintf(stderr, "Dispatching a job\n");
if (method == "unguided") { res = unguided_transcription(ctx, audio, jparams, params); }
else if (method == "guided") { res = guided_transcription(ctx, audio, params, jparams, commandset_list); }
else if (method == "seek") { res = seek(ctx, audio, jparams); }
else if (method == "registerCommandset") { res = register_commandset(ctx, jparams, commandset_list); }
else if (method == "echo") { res = jparams; }
return json{
{"jsonrpc", "2.0"},
{"result", res},
{"id", id}
};
} catch(json ex) {
return json {
{"jsonrpc", "2.0"},
{"error", ex},
{"id", id}
};
}
}
void process_loop(struct whisper_context * ctx, audio_async &audio, const whisper_params &params) {
std::deque<json> jobqueue;
std::vector<struct commandset> commandset_list;
while (true) {
// For eventual cancellation support, shouldn't block if job exists
if (std::cin.rdbuf()->in_avail() > 22 || jobqueue.size() == 0) {
int content_length;
if (scanf("Content-Length: %d", &content_length) != 1) {
fprintf(stderr, "Could not read input: %d", std::cin.peek());
return;
}
// scanf leaves the new lines intact
std::cin.ignore(2);
if (std::cin.peek() != 13) {
// Content-Type. jsonrpc necessitates utf8.
std::cin.ignore(200,10);
}
std::cin.ignore(2);
// A message is being sent and blocking is acceptable
std::string content(content_length,'\0');
std::cin.read(&content[0], content_length);
json job = json::parse(content);
// TODO: Some messages(cancellation) should skip queue here
if (job.is_array()) {
// response must also be batched. Will implement later
// for (subjob : job.begin())
// TODO: At the very least respond with an unsupported error.
} else {
jobqueue.push_back(job);
}
}
assert(jobqueue.size() > 0);
json job = jobqueue.front();
json resp = parse_job(job, ctx, audio, params, commandset_list);
if (resp != "unfinished") {
jobqueue.pop_front();
// send response
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
// +1 accounts for the trailing newline appended to the body below
fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int) data.length() + 1, data.c_str());
std::cout.flush();
}
}
}
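// A client must frame each request on this example's stdin as
// "Content-Length: <n>\r\n\r\n<body>", matching what process_loop() parses above.
// A sketch of the corresponding client-side writer (send_framed is a hypothetical
// helper, not used by this example; shown for illustration only):
static void send_framed(FILE * out, const std::string & body) {
fprintf(out, "Content-Length: %d\r\n\r\n%s", (int) body.size(), body.c_str());
fflush(out);
// e.g. send_framed(to_server, "{\"jsonrpc\": \"2.0\", \"id\": 1, \"method\": \"echo\", \"params\": {}}");
}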
int main(int argc, char ** argv) {
whisper_params params;
if (whisper_params_parse(argc, argv, params) == false) {
return 1;
}
if (whisper_lang_id(params.language.c_str()) == -1) {
fprintf(stderr, "error: unknown language '%s'\n", params.language.c_str());
whisper_print_usage(argc, argv, params);
exit(1);
}
// whisper init
struct whisper_context * ctx = whisper_init_from_file(params.model.c_str());
if (ctx == nullptr) {
fprintf(stderr, "error: failed to initialize whisper context from '%s'\n", params.model.c_str());
return 1;
}
// init audio
audio_async audio(30*1000);
if (!audio.init(params.capture_id, WHISPER_SAMPLE_RATE)) {
fprintf(stderr, "%s: audio.init() failed!\n", __func__);
return 1;
}
audio.resume();
// TODO: Investigate why this is required. An extra second of startup latency is not great
// wait for 1 second to avoid any buffered noise
std::this_thread::sleep_for(std::chrono::milliseconds(1000));
audio.clear();
// TODO: consider some sort of indicator to designate loading has finished?
// Potentially better for the client to just start with a non-blocking message (register commands)
process_loop(ctx, audio, params);
audio.pause();
whisper_print_timings(ctx);
whisper_free(ctx);
return 0;
}

View File

@ -1,362 +0,0 @@
if !exists("g:whisper_dir")
let g:whisper_dir = expand($WHISPER_CPP_HOME)
if g:whisper_dir == ""
echoerr "Please provide a path to the whisper.cpp repo in either the $WHISPER_CPP_HOME environment variable, or g:whisper_dir"
endif
endif
if !exists("g:whisper_lsp_path")
let g:whisper_lsp_path = g:whisper_dir .. "lsp"
if !filereadable(g:whisper_lsp_path)
echoerr "Was not able to locate a lsp executable at: " .. g:whisper_lsp_path
throw "Executable not found"
endif
endif
if !exists("g:whisper_model_path")
" TODO: allow custom paths relative to the repo dir
let g:whisper_model_path = g:whisper_dir .. "models/ggml-base.en.bin"
if !filereadable(g:whisper_model_path)
echoerr "Could not find model at: " .. g:whisper_model_path
throw "Model not found"
endif
endif
let s:output_buffer = bufnr("whisper_log", v:true)
call setbufvar(s:output_buffer,"&buftype","nofile")
let s:lsp_command = [g:whisper_lsp_path,"-m",g:whisper_model_path]
" For faster execution. TODO: server load multiple models/run multiple servers?
" let s:lsp_command = [g:whisper_lsp_path, "-m", g:whisper_dir .. "models/ggml-tiny.en.bin", "-ac", "128"]
" requestCommands([params_dict])
func whisper#requestCommands(...)
let l:req = {"method": "guided", "params": {"commandset_index": 0}}
if a:0 > 0
call extend(l:req.params, a:1)
endif
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback", [l:req.params, 0])})
endfunction
" doTranscription([params_dict])
func whisper#doTranscription(...)
let l:req = {"method": "unguided", "params": {}}
if a:0 > 0
call extend(l:req.params, a:1)
endif
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:transcriptionCallback", [function("s:insertText"),function("s:endTranscription")])})
endfunction
" For testing
func whisper#uppertest(cha)
echo tr(a:cha, s:c_lowerkeys, s:c_upperkeys)
endfunction
" (upper, exit, count, motion, command, insert/append, save run) "base"
" (upper, exit, count, motion, command, inside/around) "motion/visual"
" (upper, exit, count, motion, line, inside/around) "command already entered"
" (upper, exit, key, ) "from/till"
" upper and lower keys is used to translate between cases with tr
" Must be sunchronized
let s:c_lowerkeys = "1234567890-=qwertyuiop[]\\asdfghjkl;'zxcvbnm,./\""
let s:c_upperkeys = "!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?'"
let s:c_count = split("1234567890\"",'\zs')
let s:c_command = split("ryuogpdxcv.iam", '\zs')
let s:c_motion = split("wetf'hjklnb$^)",'\zs')
" object words: Word, Sentence, Paragraph, [, (, <, Tag, {. ", '
let s:c_area = split("wsp])>t}\"'",'\zs')
"Special commands.
let s:c_special_always = ["exit", "upper"]
let s:c_special_normal = ["save", "run", "space"]
" If not in dict, key is spoken word,
" If key resolves to string, value is used for normal/motion, but key for chars
" If key resolves to dict, {0: "normal",1: "motion",2:"single char",3: "area"}
" Missing entries fall back as follows {0: "required", 1: 0, 2: "key", 3: 0}
let s:spoken_dict = {"w": "word", "e": "end", "r": "replace", "t": {0: "till", 3: "tag"}, "y": "yank", "u": "undo", "i": {0: "insert", 1: "inside"}, "o": "open", "p": {0: "paste", 3: "paragraph"}, "a": {0: "append", 1: "around"}, "s": {0: "substitute", 3: "sentence"}, "d": "delete", "f": "from", "g": "go", "h": "left", "j": "down", "k": "up", "l": "right", "c": "change", "v": "visual", "b": "back", "n": "next", "m": "mark", ".": {0: "repeat", 2: "period"}, "]": {0: "bracket", 2: "bracket"}, "'": {0: "jump", 2: "apostrophe", 3: "apostrophe"}, '"': {0: 'register', 2: "quotation", 3: "quotation"}, "-": {0: "minus", 2: "minus"}, "$": {0: "dollar", 2: "dollar"}, "^": {0: "carrot", 2: "carrot"}, ")": {0: "sentence", 2: "parenthesis", 3: "parenthesis"}, "}": {0: "paragraph", 2: "brace", 3: "brace"}, ">": {0: "indent", 2: "angle", 3: "angle"}}
" Give this another pass. This seems overly hacky even if it's functional
let s:sub_tran_msg = ""
func s:subTranProg(msg)
if s:sub_tran_msg != ""
let s:sub_tran_msg = s:sub_tran_msg .. a:msg
if mode() !=? 'v'
exe "normal" "u" .. s:sub_tran_msg
endif
else
if s:command_backlog == ""
" this should not occur
call s:logCallback(0, "Warning: Encountered sub transcription without prior command")
let s:command_backlog = "a"
endif
if a:msg[0] == ' '
let s:sub_tran_msg = s:command_backlog .. a:msg[1:-1]
else
let s:sub_tran_msg = s:command_backlog .. a:msg
endif
if mode() !=? 'v'
exe "normal" s:sub_tran_msg
endif
endif
call appendbufline(s:output_buffer, "$", s:sub_tran_msg .. ":" .. string(a:msg ))
endfunction
func s:subTranFinish(params, timestamp)
let s:repeat_command = s:sub_tran_msg
" Visual selection is lot if used with streaming, so streaming of partial
" transcriptions is disabled in visual mode
if mode() ==? 'v'
exe "normal" s:sub_tran_msg
endif
let s:sub_tran_msg = ""
let s:command_backlog = ""
exe "normal a\<C-G>u"
let l:params = a:params
let l:params.timestamp = a:timestamp
if exists("l:params.commandset_index")
unlet l:params.commandset_index
endif
call whisper#requestCommands(a:params)
endfunction
func s:logCallback(channel, msg)
call appendbufline(s:output_buffer,"$",a:msg)
endfunction
func s:transcriptionCallback(progressCallback, finishedCallback, channel, msg)
let l:tr = a:msg.result.transcription
let l:ex_ind = match(tolower(l:tr),"exit", len(l:tr)-6)
" The worst case I've observed so far is " Exit.", which is 6 characters
if l:ex_ind != -1
call a:progressCallback(strpart(l:tr,0,l:ex_ind-1))
call a:finishedCallback(a:msg.result.timestamp)
else
call a:progressCallback(l:tr)
let req = {"method": "unguided", "params": {"timestamp": a:msg.result.timestamp, "no_context": v:true}}
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [a:progressCallback, a:finishedCallback])})
endif
endfunc
func s:insertText(msg)
exe "normal a" .. a:msg
endfunction
func s:endTranscription(timestamp)
call appendbufline(s:output_buffer, "$", "Ending unguided transcription")
endfunction
" If a command does not include a whole actionable step, attempting to execute
" it discards the remainder of things. There is likely a simpler solution,
" but it can be made functional now by storing a backbuffer until actionable
let s:command_backlog = ""
let s:repeat_command = ""
let s:preceeding_upper = v:false
func s:commandCallback(params, commandset_index, channel, msg)
let l:command_index = a:msg.result.command_index
let l:do_execute = v:false
let l:next_mode = a:commandset_index
let l:command = s:commandset_list[a:commandset_index][l:command_index]
call s:logCallback(0, string(a:msg) .. " " .. a:commandset_index .. " " .. l:command)
if l:command_index == 0
"exit
"if s:command_backlog == ""
call s:logCallback(0,"Stopping command mode")
echo "No longer listening"
let s:command_backlog = ""
return
"else
" Legacy code to clear an existing buffer with exit.
" Was found to be rarely desired and is better introduced as a
" standalone command (clear?)
" call s:logCallback(0,"Clearing command_backlog" .. s:command_backlog)
" let s:command_backlog = ""
" let s:preceeding_upper = v:false
" endif
elseif l:command_index == 1
" upper
let s:preceeding_upper = !s:preceeding_upper
elseif l:command == "save"
" save and run can only happen in commandset 0,
exe "w"
elseif l:command == "run"
exe "make run"
elseif l:command == "space"
exe "normal i \<ESC>l"
elseif has_key(s:c_user, l:command)
let Userfunc = s:c_user[l:command]
if type(Userfunc) == v:t_string
let Userfunc = function(Userfunc)
endif
call Userfunc()
else
if s:preceeding_upper
" Upper should keep commandset
let s:preceeding_upper = v:false
let l:visual_command = tr(l:command, s:c_lowerkeys, s:c_upperkeys)
else
let l:visual_command = l:command
endif
echo s:command_backlog .. " - " .. l:visual_command
let s:command_backlog = s:command_backlog .. l:visual_command
if a:commandset_index == 2 || a:commandset_index == 3
" single key, either completes motion, replace, or register
" Should move to execute unless part of a register
" Change will be caught at execute
if s:command_backlog[-2:-2] !=# '"'
call s:logCallback(0,"not register")
let l:do_execute = v:true
end
let l:next_mode = 0
" commandset index only matters for a/i
elseif (l:command == "a" || l:command == "i") && a:commandset_index == 1
" inside/around. Is commandset 3
let l:next_mode = 3
elseif l:command ==# '"'
let l:next_mode = 2
elseif index(s:c_count, l:command) != -1
let l:next_mode = a:commandset_index
elseif index(s:c_motion, l:command) != -1
if l:command == 't' || l:command == 'f' || l:command == "'"
" prompt single key
let l:next_mode = 2
else
let l:do_execute = v:true
let l:next_mode = 0
endif
elseif index(s:c_command, l:command) != -1
if index(["y","g","d","c"], s:command_backlog[-1:-1]) != -1 && s:command_backlog[-1:-1] != s:command_backlog[-2:-2] && mode() !=? 'v'
" need motion or repeated command
" Potential for bad state here if disparaging command keys are
" entered (i.e. yd), but vim can handle checks for this at exe
" And checking for cases like y123d would complicate things
let l:next_mode = 1
elseif index(["i","a","c", "o", "s"], l:command) != -1 || s:command_backlog[-1:-1] ==# 'R'
"'Insert' mode, do general transcription
let l:req = {"method": "unguided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.no_context = v:true
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
return
elseif l:command == 'r' || l:command == 'm'
let l:next_mode = 2
elseif l:command == '.'
let l:next_mode = 0
let l:do_execute = v:true
let s:command_backlog = s:command_backlog[0:-2] .. s:repeat_command
else
if l:command ==? 'v'
let l:next_mode = 1
else
let l:next_mode = 0
endif
let l:do_execute = v:true
endif
else
throw "Invalid command state: " .. l:command .. " " .. a:commandset_index .. " " .. s:command_backlog
endif
endif
if l:do_execute
if mode() ==?'v' && l:next_mode == 0
let l:next_mode = 1
elseif match(s:command_backlog, 'c') != -1
let l:req = {"method": "unguided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.no_context = v:true
let resp = ch_sendexpr(g:lsp_job, req, {"callback": function("s:transcriptionCallback", [function("s:subTranProg"), function("s:subTranFinish", [a:params])])})
return
endif
exe "normal" s:command_backlog
if index(s:c_motion + ["u"],l:command) == -1
exe "normal a\<C-G>u"
let s:repeat_command = s:command_backlog
call s:logCallback(0, s:command_backlog)
endif
let s:command_backlog = ""
endif
let l:req = {"method": "guided", "params": a:params}
let l:req.params.timestamp = a:msg.result.timestamp
let l:req.params.commandset_index = l:next_mode
let resp = ch_sendexpr(g:lsp_job, l:req, {"callback": function("s:commandCallback",[a:params, l:next_mode])})
endfunction
func s:loadedCallback(channel, msg)
echo "Loading complete"
call s:logCallback(a:channel, a:msg)
endfunction
func s:registerCommandset(commandlist, is_final)
let req = {"method": "registerCommandset"}
let req.params = a:commandlist
call s:logCallback(0, join(a:commandlist))
call add(g:whisper_commandlist_spoken, a:commandlist)
if a:is_final
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:loadedCallback"})
else
let resp = ch_sendexpr(g:lsp_job, req, {"callback": "s:logCallback"})
endif
endfunction
func s:registerAllCommands()
let l:normal = s:c_special_always + s:c_special_normal + s:c_count + s:c_command + s:c_motion + keys(s:c_user)
let l:visual = s:c_special_always + s:c_count + s:c_command + s:c_motion
" Currently the same as visual.
" let l:post_command = s:c_special_always + s:c_count + s:c_command + s:c_motion
let l:single_key = s:c_special_always + split(s:c_lowerkeys, '\zs')
let l:area = s:c_special_always + s:c_area
" Used only for compatibility with the testing script
let g:whisper_commandlist_spoken = []
let s:commandset_list = [l:normal, l:visual, l:single_key, l:area]
call s:registerCommandset(s:commandsetToSpoken(l:normal, 0), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:visual, 1), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:single_key, 2), v:false)
call s:registerCommandset(s:commandsetToSpoken(l:area, 3), v:true)
endfunction
func s:commandsetToSpoken(commandset, spoken_index)
let l:spoken_list = []
for l:command in a:commandset
if has_key(s:spoken_dict, l:command)
let l:spoken_value = s:spoken_dict[l:command]
if type(l:spoken_value) == v:t_dict
if has_key(l:spoken_value, a:spoken_index)
let l:spoken_value = l:spoken_value[a:spoken_index]
else
if a:spoken_index == 2
let l:spoken_value = l:command
else
let l:spoken_value = l:spoken_value[0]
endif
endif
else
if a:spoken_index == 2
let l:spoken_value = l:command
endif
endif
else
let l:spoken_value = l:command
endif
call add(l:spoken_list, l:spoken_value)
endfor
return l:spoken_list
endfunction
" TODO: Check lifetime. If the script is resourced, is the existing
" s:lsp_job dropped and therefore killed?
" This seems to not be the case and I've had to deal with zombie processes
" that survive exiting vim, even though said behavior conflicts with my
" understanding of the provided documentation
let s:lsp_opts = {"in_mode": "lsp", "out_mode": "lsp", "err_mode": "nl", "err_io": "buffer", "err_buf": s:output_buffer}
if !exists("g:lsp_job")
if exists("g:whisper_user_commands")
let s:c_user = g:whisper_user_commands
else
let s:c_user = {}
endif
let g:lsp_job = job_start(s:lsp_command, s:lsp_opts)
if job_status(g:lsp_job) == "fail"
echoerr "Failed to start whisper job"
endif
call s:registerAllCommands()
endif
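" Usage: once g:lsp_job is running, dictation can be started with
" :call whisper#doTranscription() and guided command mode with
" :call whisper#requestCommands(); both accept an optional parameter dict
" that is passed through to the server.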

View File

@ -59,7 +59,6 @@ struct whisper_params {
int32_t offset_t_ms = 0; int32_t offset_t_ms = 0;
int32_t offset_n = 0; int32_t offset_n = 0;
int32_t duration_ms = 0; int32_t duration_ms = 0;
int32_t progress_step = 5;
int32_t max_context = -1; int32_t max_context = -1;
int32_t max_len = 0; int32_t max_len = 0;
int32_t best_of = 2; int32_t best_of = 2;
@ -70,7 +69,6 @@ struct whisper_params {
float logprob_thold = -1.00f; float logprob_thold = -1.00f;
bool speed_up = false; bool speed_up = false;
bool debug_mode = false;
bool translate = false; bool translate = false;
bool detect_language = false; bool detect_language = false;
bool diarize = false; bool diarize = false;
@ -88,7 +86,6 @@ struct whisper_params {
bool print_colors = false; bool print_colors = false;
bool print_progress = false; bool print_progress = false;
bool no_timestamps = false; bool no_timestamps = false;
bool log_score = false;
std::string language = "en"; std::string language = "en";
std::string prompt; std::string prompt;
@ -136,8 +133,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); } else if (arg == "-wt" || arg == "--word-thold") { params.word_thold = std::stof(argv[++i]); }
else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); } else if (arg == "-et" || arg == "--entropy-thold") { params.entropy_thold = std::stof(argv[++i]); }
else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); } else if (arg == "-lpt" || arg == "--logprob-thold") { params.logprob_thold = std::stof(argv[++i]); }
// else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; } else if (arg == "-su" || arg == "--speed-up") { params.speed_up = true; }
else if (arg == "-debug"|| arg == "--debug-mode") { params.debug_mode = true; }
else if (arg == "-tr" || arg == "--translate") { params.translate = true; } else if (arg == "-tr" || arg == "--translate") { params.translate = true; }
else if (arg == "-di" || arg == "--diarize") { params.diarize = true; } else if (arg == "-di" || arg == "--diarize") { params.diarize = true; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; } else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
@ -162,7 +158,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); } else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); }
else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; } else if (arg == "-oved" || arg == "--ov-e-device") { params.openvino_encode_device = argv[++i]; }
else if (arg == "-ls" || arg == "--log-score") { params.log_score = true; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -192,8 +187,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold); fprintf(stderr, " -wt N, --word-thold N [%-7.2f] word timestamp probability threshold\n", params.word_thold);
fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold); fprintf(stderr, " -et N, --entropy-thold N [%-7.2f] entropy threshold for decoder fail\n", params.entropy_thold);
fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold); fprintf(stderr, " -lpt N, --logprob-thold N [%-7.2f] log probability threshold for decoder fail\n", params.logprob_thold);
// fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false"); fprintf(stderr, " -su, --speed-up [%-7s] speed up audio by x2 (reduced accuracy)\n", params.speed_up ? "true" : "false");
fprintf(stderr, " -debug, --debug-mode [%-7s] enable debug mode (eg. dump log_mel)\n", params.debug_mode ? "true" : "false");
fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false"); fprintf(stderr, " -tr, --translate [%-7s] translate from source language to english\n", params.translate ? "true" : "false");
fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false"); fprintf(stderr, " -di, --diarize [%-7s] stereo audio diarization\n", params.diarize ? "true" : "false");
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false"); fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
@ -217,7 +211,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", ""); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", "");
fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str()); fprintf(stderr, " -oved D, --ov-e-device DNAME [%-7s] the OpenVINO device used for encode inference\n", params.openvino_encode_device.c_str());
fprintf(stderr, " -ls, --log-score [%-7s] log best decoder scores of tokens\n", params.log_score?"true":"false");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -225,7 +218,6 @@ struct whisper_print_user_data {
const whisper_params * params; const whisper_params * params;
const std::vector<std::vector<float>> * pcmf32s; const std::vector<std::vector<float>> * pcmf32s;
int progress_prev;
}; };
std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) { std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
@ -260,14 +252,6 @@ std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s
return speaker; return speaker;
} }
void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
*progress_prev += progress_step;
fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress);
}
}
void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) { void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int n_new, void * user_data) {
const auto & params = *((whisper_print_user_data *) user_data)->params; const auto & params = *((whisper_print_user_data *) user_data)->params;
@ -492,25 +476,6 @@ bool output_csv(struct whisper_context * ctx, const char * fname, const whisper_
return true; return true;
} }
bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
const int n_segments = whisper_full_n_segments(ctx);
// fprintf(stderr,"segments: %d\n",n_segments);
for (int i = 0; i < n_segments; ++i) {
const int n_tokens = whisper_full_n_tokens(ctx, i);
// fprintf(stderr,"tokens: %d\n",n_tokens);
for (int j = 0; j < n_tokens; j++) {
auto token = whisper_full_get_token_text(ctx, i, j);
auto probability = whisper_full_get_token_p(ctx, i, j);
fout << token << '\t' << probability << std::endl;
// fprintf(stderr,"token: %s %f\n",token,probability);
}
}
return true;
}
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) { bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
std::ofstream fout(fname); std::ofstream fout(fname);
int indent = 0; int indent = 0;
@ -918,7 +883,6 @@ int main(int argc, char ** argv) {
wparams.split_on_word = params.split_on_word; wparams.split_on_word = params.split_on_word;
wparams.speed_up = params.speed_up; wparams.speed_up = params.speed_up;
wparams.debug_mode = params.debug_mode;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ] wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
@ -931,7 +895,7 @@ int main(int argc, char ** argv) {
wparams.entropy_thold = params.entropy_thold; wparams.entropy_thold = params.entropy_thold;
wparams.logprob_thold = params.logprob_thold; wparams.logprob_thold = params.logprob_thold;
whisper_print_user_data user_data = { &params, &pcmf32s, 0 }; whisper_print_user_data user_data = { &params, &pcmf32s };
// this callback is called on each new segment // this callback is called on each new segment
if (!wparams.print_realtime) { if (!wparams.print_realtime) {
@ -939,11 +903,6 @@ int main(int argc, char ** argv) {
wparams.new_segment_callback_user_data = &user_data; wparams.new_segment_callback_user_data = &user_data;
} }
if (wparams.print_progress) {
wparams.progress_callback = whisper_print_progress_callback;
wparams.progress_callback_user_data = &user_data;
}
// example for abort mechanism // example for abort mechanism
// in this example, we do not abort the processing, but we could if the flag is set to true // in this example, we do not abort the processing, but we could if the flag is set to true
// the callback is called before every encoder run - if it returns false, the processing is aborted // the callback is called before every encoder run - if it returns false, the processing is aborted
@ -1008,12 +967,6 @@ int main(int argc, char ** argv) {
const auto fname_lrc = fname_out + ".lrc"; const auto fname_lrc = fname_out + ".lrc";
output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s); output_lrc(ctx, fname_lrc.c_str(), params, pcmf32s);
} }
// output to score file
if (params.log_score) {
const auto fname_score = fname_out + ".score.txt";
output_score(ctx, fname_score.c_str(), params, pcmf32s);
}
} }
} }

View File

@ -138,7 +138,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
// return false; // return false;
//} //}
char word[129]; char word[128];
for (int i = 0; i < n_vocab; i++) { for (int i = 0; i < n_vocab; i++) {
uint32_t len; uint32_t len;

View File

@ -47,7 +47,6 @@ struct whisper_params {
bool print_special = false; bool print_special = false;
bool no_context = true; bool no_context = true;
bool no_timestamps = false; bool no_timestamps = false;
bool tinydiarize = false;
std::string language = "en"; std::string language = "en";
std::string model = "models/ggml-base.en.bin"; std::string model = "models/ggml-base.en.bin";
@ -81,8 +80,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; }
else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; }
else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_out = argv[++i]; }
else if (arg == "-tdrz" || arg == "--tinydiarize") { params.tinydiarize = true; }
else { else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str()); fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params); whisper_print_usage(argc, argv, params);
@ -116,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str()); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language\n", params.language.c_str());
fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str());
fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] text output file name\n", params.fname_out.c_str());
fprintf(stderr, " -tdrz, --tinydiarize [%-7s] enable tinydiarize (requires a tdrz model)\n", params.tinydiarize ? "true" : "false");
fprintf(stderr, "\n"); fprintf(stderr, "\n");
} }
@ -303,8 +299,6 @@ int main(int argc, char ** argv) {
wparams.audio_ctx = params.audio_ctx; wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up; wparams.speed_up = params.speed_up;
wparams.tdrz_enable = params.tinydiarize; // [TDRZ]
// disable temperature fallback // disable temperature fallback
//wparams.temperature_inc = -1.0f; //wparams.temperature_inc = -1.0f;
wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc; wparams.temperature_inc = params.no_fallback ? 0.0f : wparams.temperature_inc;
@ -350,19 +344,10 @@ int main(int argc, char ** argv) {
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
std::string output = "[" + to_timestamp(t0) + " --> " + to_timestamp(t1) + "] " + text; printf ("[%s --> %s] %s\n", to_timestamp(t0).c_str(), to_timestamp(t1).c_str(), text);
if (whisper_full_get_segment_speaker_turn_next(ctx, i)) {
output += " [SPEAKER_TURN]";
}
output += "\n";
printf("%s", output.c_str());
fflush(stdout);
if (params.fname_out.length() > 0) { if (params.fname_out.length() > 0) {
fout << output; fout << "[" << to_timestamp(t0) << " --> " << to_timestamp(t1) << "] " << text << std::endl;
} }
} }
} }

View File

@ -18,9 +18,6 @@ android {
vectorDrawables { vectorDrawables {
useSupportLibrary true useSupportLibrary true
} }
ndk {
abiFilters 'arm64-v8a', 'armeabi-v7a', 'x86', 'x86_64'
}
} }
buildTypes { buildTypes {
@ -45,8 +42,8 @@ android {
} }
ndkVersion "25.1.8937393" ndkVersion "25.1.8937393"
externalNativeBuild { externalNativeBuild {
cmake { ndkBuild {
path = file("src/main/jni/whisper/CMakeLists.txt") path 'src/main/jni/whisper/Android.mk'
} }
} }
packagingOptions { packagingOptions {

View File

@ -0,0 +1,26 @@
LOCAL_PATH := $(call my-dir)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper
include $(LOCAL_PATH)/Whisper.mk
include $(BUILD_SHARED_LIBRARY)
ifeq ($(TARGET_ARCH_ABI),armeabi-v7a)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper_vfpv4
include $(LOCAL_PATH)/Whisper.mk
# Allow building NEON FMA code.
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
LOCAL_CFLAGS += -mfpu=neon-vfpv4
include $(BUILD_SHARED_LIBRARY)
endif
ifeq ($(TARGET_ARCH_ABI),arm64-v8a)
include $(CLEAR_VARS)
LOCAL_MODULE := libwhisper_v8fp16_va
include $(LOCAL_PATH)/Whisper.mk
# Allow building NEON FMA code.
# https://android.googlesource.com/platform/ndk/+/master/sources/android/cpufeatures/cpu-features.h
LOCAL_CFLAGS += -march=armv8.2-a+fp16
include $(BUILD_SHARED_LIBRARY)
endif

View File

@ -0,0 +1 @@
APP_STL := c++_static

View File

@ -1,53 +0,0 @@
cmake_minimum_required(VERSION 3.10)
project(whisper.cpp)
set(CMAKE_CXX_STANDARD 11)
set(WHISPER_LIB_DIR ${CMAKE_SOURCE_DIR}/../../../../../../../)
set(
SOURCE_FILES
${WHISPER_LIB_DIR}/ggml.c
${WHISPER_LIB_DIR}/whisper.cpp
${CMAKE_SOURCE_DIR}/jni.c
)
find_library(LOG_LIB log)
function(build_library target_name)
add_library(
${target_name}
SHARED
${SOURCE_FILES}
)
target_link_libraries(${target_name} ${LOG_LIB} android)
if (${target_name} STREQUAL "whisper_v8fp16_va")
target_compile_options(${target_name} PRIVATE -march=armv8.2-a+fp16)
elseif (${target_name} STREQUAL "whisper_vfpv4")
target_compile_options(${target_name} PRIVATE -mfpu=neon-vfpv4)
endif ()
if (NOT ${CMAKE_BUILD_TYPE} STREQUAL "Debug")
target_compile_options(${target_name} PRIVATE -O3)
target_compile_options(${target_name} PRIVATE -fvisibility=hidden -fvisibility-inlines-hidden)
target_compile_options(${target_name} PRIVATE -ffunction-sections -fdata-sections)
target_link_options(${target_name} PRIVATE -Wl,--gc-sections)
target_link_options(${target_name} PRIVATE -Wl,--exclude-libs,ALL)
target_link_options(${target_name} PRIVATE -flto)
endif ()
endfunction()
build_library("whisper") # Default target
if (${ANDROID_ABI} STREQUAL "arm64-v8a")
build_library("whisper_v8fp16_va")
elseif (${ANDROID_ABI} STREQUAL "armeabi-v7a")
build_library("whisper_vfpv4")
endif ()
include_directories(${WHISPER_LIB_DIR})

View File

@ -0,0 +1,18 @@
WHISPER_LIB_DIR := $(LOCAL_PATH)/../../../../../../../
LOCAL_LDLIBS := -landroid -llog
# Make the final output library smaller by only keeping the symbols referenced from the app.
ifneq ($(APP_OPTIM),debug)
LOCAL_CFLAGS += -O3
LOCAL_CFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden
LOCAL_CFLAGS += -ffunction-sections -fdata-sections
LOCAL_LDFLAGS += -Wl,--gc-sections
LOCAL_LDFLAGS += -Wl,--exclude-libs,ALL
LOCAL_LDFLAGS += -flto
endif
LOCAL_CFLAGS += -DSTDC_HEADERS -std=c11 -I $(WHISPER_LIB_DIR)
LOCAL_CPPFLAGS += -std=c++11
LOCAL_SRC_FILES := $(WHISPER_LIB_DIR)/ggml.c \
$(WHISPER_LIB_DIR)/whisper.cpp \
$(LOCAL_PATH)/jni.c

View File

@ -6,60 +6,9 @@
#include <atomic> #include <atomic>
#include <assert.h> #include <assert.h>
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#include <hipblas/hipblas.h>
#include <hip/hip_fp16.h>
#include <rocblas/rocblas.h>
#define CUBLAS_OP_N HIPBLAS_OP_N
#define CUBLAS_OP_T HIPBLAS_OP_T
#define CUBLAS_STATUS_SUCCESS HIPBLAS_STATUS_SUCCESS
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasCreate hipblasCreate
#define cublasGetStatusString rocblas_status_to_string
#define cublasHandle_t hipblasHandle_t
#define cublasLoggerConfigure(logIsOn, logToStdOut, logToStdErr, logFileName) CUBLAS_STATUS_SUCCESS
#define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS
#define cublasSetStream hipblasSetStream
#define cublasSgemm hipblasSgemm
#define cublasStatus_t hipblasStatus_t
#define cudaDeviceProp hipDeviceProp_t
#define cudaDeviceSynchronize hipDeviceSynchronize
#define cudaError_t hipError_t
#define cudaEventCreateWithFlags hipEventCreateWithFlags
#define cudaEventDestroy hipEventDestroy
#define cudaEventDisableTiming hipEventDisableTiming
#define cudaEventRecord hipEventRecord
#define cudaEvent_t hipEvent_t
#define cudaFree hipFree
#define cudaFreeHost hipHostFree
#define cudaGetDevice hipGetDevice
#define cudaGetDeviceCount hipGetDeviceCount
#define cudaGetDeviceProperties hipGetDeviceProperties
#define cudaGetErrorString hipGetErrorString
#define cudaGetLastError hipGetLastError
#define cudaMalloc hipMalloc
#define cudaMallocHost hipHostMalloc
#define cudaMemcpy hipMemcpy
#define cudaMemcpy2DAsync hipMemcpy2DAsync
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyDeviceToDevice hipMemcpyDeviceToDevice
#define cudaMemcpyDeviceToHost hipMemcpyDeviceToHost
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamNonBlocking hipStreamNonBlocking
#define cudaStreamWaitEvent(stream, event) hipStreamWaitEvent(stream, event, 0)
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#else
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#include <cuda_fp16.h> #include <cuda_fp16.h>
#endif
#include "ggml-cuda.h" #include "ggml-cuda.h"
#include "ggml.h" #include "ggml.h"

View File

@ -653,13 +653,13 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128... const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
const int in = tid - step*im; // 0...15 or 0...7 const int in = tid - step*im; // 0...15 or 0...7
\n#if K_QUANTS_PER_ITERATION == 1\n #if K_QUANTS_PER_ITERATION == 1
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15 const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
const int is = 0; const int is = 0;
\n#else\n #else
const int l0 = 4 * in; // 0, 4, 8, ..., 28 const int l0 = 4 * in; // 0, 4, 8, ..., 28
const int is = in / 4; const int is = in / 4;
\n#endif\n #endif
const int ql_offset = 64*im + l0; const int ql_offset = 64*im + l0;
const int qh_offset = 32*im + l0; const int qh_offset = 32*im + l0;
const int s_offset = 8*im + is; const int s_offset = 8*im + is;
@ -676,7 +676,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
const float d = vload_half(0, &x[i].d); const float d = vload_half(0, &x[i].d);
\n#if K_QUANTS_PER_ITERATION == 1\n #if K_QUANTS_PER_ITERATION == 1
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32) float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32) + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32) + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@ -686,7 +686,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32) + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32); +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
tmp[16 * ix + tid] += sum; tmp[16 * ix + tid] += sum;
\n#else\n #else
float sum = 0; float sum = 0;
for (int l = 0; l < 4; ++l) { for (int l = 0; l < 4; ++l) {
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32) sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@ -695,7 +695,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32); + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
} }
tmp[16 * ix + tid] += sum; tmp[16 * ix + tid] += sum;
\n#endif\n #endif
} }

23
ggml.c
View File

@ -292,7 +292,7 @@ typedef double ggml_float;
#if defined(_MSC_VER) || defined(__MINGW32__) #if defined(_MSC_VER) || defined(__MINGW32__)
#include <intrin.h> #include <intrin.h>
#else #else
#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__) #if !defined(__riscv)
#include <immintrin.h> #include <immintrin.h>
#endif #endif
#endif #endif
@ -663,7 +663,7 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
} }
static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) { static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
#ifdef __AVXVNNI__ #if __AVXVNNI__
const __m256i zero = _mm256_setzero_si256(); const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy); const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
return _mm256_cvtepi32_ps(summed_pairs); return _mm256_cvtepi32_ps(summed_pairs);
@ -676,7 +676,7 @@ static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy)
// multiply int8_t, add results pairwise twice and return as float vector // multiply int8_t, add results pairwise twice and return as float vector
static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) { static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
#ifdef __AVXVNNIINT8__ #if __AVXVNNIINT8__
const __m256i zero = _mm256_setzero_si256(); const __m256i zero = _mm256_setzero_si256();
const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y); const __m256i summed_pairs = _mm256_dpbssd_epi32(zero, x, y);
return _mm256_cvtepi32_ps(summed_pairs); return _mm256_cvtepi32_ps(summed_pairs);
@ -692,7 +692,7 @@ static inline __m256 mul_sum_i8_pairs_float(const __m256i x, const __m256i y) {
static inline __m128i packNibbles( __m256i bytes ) static inline __m128i packNibbles( __m256i bytes )
{ {
// Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh // Move bits within 16-bit lanes from 0000_abcd_0000_efgh into 0000_0000_abcd_efgh
#ifdef __AVX512F__ #if __AVX512F__
const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000 const __m256i bytes_srli_4 = _mm256_srli_epi16(bytes, 4); // 0000_0000_abcd_0000
bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh bytes = _mm256_or_si256(bytes, bytes_srli_4); // 0000_abcd_abcd_efgh
return _mm256_cvtepi16_epi8(bytes); // abcd_efgh return _mm256_cvtepi16_epi8(bytes); // abcd_efgh
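An aside on the #ifdef vs #if hunks in this file (__AVXVNNI__, __AVXVNNIINT8__, __AVX512F__): both forms compile, but #if evaluates an undefined macro as 0 (and trips -Wundef), while #ifdef or #if defined(...) tests only definedness. A standalone illustration, not part of the diff:

#include <cstdio>
int main() {
#if defined(__AVX512F__)
// only reached when the compiler advertises AVX-512F (e.g. -mavx512f)
std::printf("AVX-512F path\n");
#else
std::printf("portable path\n");
#endif
return 0;
}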
@ -4949,13 +4949,6 @@ struct ggml_tensor * ggml_set_name(struct ggml_tensor * tensor, const char * nam
return tensor; return tensor;
} }
#ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((gnu_format(printf, 2, 3)))
#else
__attribute__((format(printf, 2, 3)))
#endif
#endif
struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) { struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * fmt, ...) {
va_list args; va_list args;
va_start(args, fmt); va_start(args, fmt);
@ -18728,14 +18721,6 @@ int ggml_cpu_has_sse3(void) {
#endif #endif
} }
int ggml_cpu_has_ssse3(void) {
#if defined(__SSSE3__)
return 1;
#else
return 0;
#endif
}
int ggml_cpu_has_vsx(void) { int ggml_cpu_has_vsx(void) {
#if defined(__POWER9_VECTOR__) #if defined(__POWER9_VECTOR__)
return 1; return 1;

1
ggml.h
View File

@ -1508,7 +1508,6 @@ extern "C" {
GGML_API int ggml_cpu_has_clblast (void); GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void); GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void); GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);
GGML_API int ggml_cpu_has_vsx (void); GGML_API int ggml_cpu_has_vsx (void);
// //

View File

@ -7,7 +7,6 @@ from torch import Tensor
from torch import nn from torch import nn
from typing import Dict from typing import Dict
from typing import Optional from typing import Optional
from ane_transformers.reference.layer_norm import LayerNormANE as LayerNormANEBase
from coremltools.models.neural_network.quantization_utils import quantize_weights from coremltools.models.neural_network.quantization_utils import quantize_weights
from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions from whisper.model import Whisper, AudioEncoder, TextDecoder, ResidualAttentionBlock, MultiHeadAttention, ModelDimensions
from whisper import load_model from whisper import load_model
@ -32,12 +31,12 @@ def correct_for_bias_scale_order_inversion(state_dict, prefix, local_metadata,
state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight'] state_dict[prefix + 'bias'] = state_dict[prefix + 'bias'] / state_dict[prefix + 'weight']
return state_dict return state_dict
class LayerNormANE(LayerNormANEBase): class LayerNorm(nn.LayerNorm):
def forward(self, x: Tensor) -> Tensor:
def __init__(self, *args, **kwargs): x = x.transpose(1,3)
super().__init__(*args, **kwargs) x = super().forward(x)
self._register_load_state_dict_pre_hook( x = x.transpose(1,3)
correct_for_bias_scale_order_inversion) return x
class MultiHeadAttentionANE(MultiHeadAttention): class MultiHeadAttentionANE(MultiHeadAttention):
def __init__(self, n_state: int, n_head: int): def __init__(self, n_state: int, n_head: int):
@ -104,9 +103,9 @@ class ResidualAttentionBlockANE(ResidualAttentionBlock):
def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
super().__init__(n_state, n_head, cross_attention) super().__init__(n_state, n_head, cross_attention)
self.attn = MultiHeadAttentionANE(n_state, n_head) self.attn = MultiHeadAttentionANE(n_state, n_head)
self.attn_ln = LayerNormANE(n_state) self.attn_ln = LayerNorm(n_state)
self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
self.cross_attn_ln = LayerNormANE(n_state) if cross_attention else None self.cross_attn_ln = LayerNorm(n_state) if cross_attention else None
n_mlp = n_state * 4 n_mlp = n_state * 4
self.mlp = nn.Sequential( self.mlp = nn.Sequential(
@ -114,7 +113,7 @@ class ResidualAttentionBlockANE(ResidualAttentionBlock):
nn.GELU(), nn.GELU(),
nn.Conv2d(n_mlp, n_state, kernel_size=1) nn.Conv2d(n_mlp, n_state, kernel_size=1)
) )
self.mlp_ln = LayerNormANE(n_state) self.mlp_ln = LayerNorm(n_state)
class AudioEncoderANE(AudioEncoder): class AudioEncoderANE(AudioEncoder):
@ -124,7 +123,7 @@ class AudioEncoderANE(AudioEncoder):
self.blocks = nn.ModuleList( self.blocks = nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)] [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
) )
self.ln_post = LayerNormANE(n_state) self.ln_post = LayerNorm(n_state)
def forward(self, x: Tensor): def forward(self, x: Tensor):
""" """
@ -168,7 +167,7 @@ class TextDecoderANE(TextDecoder):
self.blocks= nn.ModuleList( self.blocks= nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)] [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
) )
self.ln= LayerNormANE(n_state) self.ln= LayerNorm(n_state)
def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
""" """

View File

@ -8,7 +8,7 @@
wd=$(dirname "$0") wd=$(dirname "$0")
cd "$wd/../" cd "$wd/../"
python3 models/convert-whisper-to-coreml.py --model tiny.en python3 models/convert-whisper-to-coreml.py --model tiny.en --optimize-ane True
mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage mv -v models/coreml-encoder-tiny.en.mlpackage models/whisper-encoder-impl.mlpackage
xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/ xcrun coremlc generate models/whisper-encoder-impl.mlpackage coreml/

View File

@ -13,7 +13,7 @@ mname="$1"
wd=$(dirname "$0") wd=$(dirname "$0")
cd "$wd/../" cd "$wd/../"
python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True python3 models/convert-whisper-to-coreml.py --model $mname --encoder-only True --optimize-ane True
xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/ xcrun coremlc compile models/coreml-encoder-${mname}.mlpackage models/
rm -rf models/ggml-${mname}-encoder.mlmodelc rm -rf models/ggml-${mname}-encoder.mlmodelc

File diff suppressed because it is too large

View File

@ -67,7 +67,6 @@ extern "C" {
struct whisper_context; struct whisper_context;
struct whisper_state; struct whisper_state;
struct whisper_full_params;
typedef int whisper_token; typedef int whisper_token;
@ -346,7 +345,7 @@ extern "C" {
void * user_data); void * user_data);
// Parameters for the whisper_full() function // Parameters for the whisper_full() function
// If you change the order or add new parameters, make sure to update the default values in whisper.cpp: // If you chnage the order or add new parameters, make sure to update the default values in whisper.cpp:
// whisper_full_default_params() // whisper_full_default_params()
struct whisper_full_params { struct whisper_full_params {
enum whisper_sampling_strategy strategy; enum whisper_sampling_strategy strategy;
@ -375,7 +374,6 @@ extern "C" {
// [EXPERIMENTAL] speed-up techniques // [EXPERIMENTAL] speed-up techniques
// note: these can significantly reduce the quality of the output // note: these can significantly reduce the quality of the output
bool speed_up; // speed-up the audio by 2x using Phase Vocoder bool speed_up; // speed-up the audio by 2x using Phase Vocoder
bool debug_mode; // enable debug_mode provides extra info (eg. Dump log_mel)
int audio_ctx; // overwrite the audio context size (0 = use default) int audio_ctx; // overwrite the audio context size (0 = use default)
// [EXPERIMENTAL] [TDRZ] tinydiarize // [EXPERIMENTAL] [TDRZ] tinydiarize
@ -519,11 +517,6 @@ extern "C" {
WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads); WHISPER_API int whisper_bench_ggml_mul_mat (int n_threads);
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads); WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads);
// Control logging output; default behavior is to print to stderr
typedef void (*whisper_log_callback)(const char * line);
WHISPER_API void whisper_set_log_callback(whisper_log_callback callback);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif