Compare commits


24 Commits

Author SHA1 Message Date
fa8dbdc888 release : v1.4.0 2023-04-30 19:23:37 +03:00
4a7d49af95 examples : fix + refactor Levenshtein distance 2023-04-30 19:12:49 +03:00
794b162a46 whisper : add integer quantization support (#540)
* whisper : add integer quantization support

* examples : add common-ggml + prepare to add "quantize" tool

* whisper : quantization tool ready

* whisper : fix F32 support

* whisper : try to fix shared lib linkage

* wasm : update quantized models to Q5

* bench.wasm : remove "medium" button

* bench.wasm : fix custom model button

* ggml : add Q5_0 and Q5_1 WASM SIMD

* wasm : add quantized models to all WASM examples

* wasm : bump DB version number to 2

* talk-llama : update example to latest llama.cpp

* node : increase test timeout to 10s

* readme : add information for model quantization

* wasm : add links to other examples
2023-04-30 18:51:57 +03:00
5fd1bdd7fc whisper : add GPU support via cuBLAS (#834)
* make : add WHISPER_CUBLAS

* make : fix CUBLAS build

* whisper : disable Flash Attention + adjust memory buffers

* whisper : remove old commented code

* readme : add cuBLAS instructions

* cmake : add WHISPER_CUBLAS option

* gitignore : ignore build-cublas
2023-04-30 12:14:33 +03:00
0ccd6746c9 ggml : fix WASM build 2023-04-29 21:37:23 +03:00
d9b550c0a1 ggml : fix 32-bit ARM NEON (#836)
* ggml : add support for 32-bit ARM

* ggml : fix

* ggml : fix
2023-04-29 21:33:33 +03:00
e9b091c92a ggml : use vzip instead of vuzp for consistency 2023-04-29 21:14:09 +03:00
1f30b99208 ggml : fix WASM build 2023-04-29 20:21:25 +03:00
05c3ea3bc8 ggml : sync with ggml repo (warning fixes + asserts) 2023-04-29 19:33:28 +03:00
6108d3cc58 whisper : use correct seek_end when offset is used (#833)
Whenever an `offset_ms` is provided, the value of `seek_end` is
calculated incorrectly. This causes Whisper to keep transcribing
after the end of the file.

The current behavior looks like
```
[00:34:40.000 --> 00:34:47.000]   This is an example audio file.
[00:34:47.000 --> 00:34:49.000]   The text has been redacted
[00:34:49.000 --> 00:34:51.000]   This is the end of the audio.
[00:34:51.000 --> 00:34:52.000]   ***
[00:34:52.000 --> 00:34:53.000]   ***
[00:34:53.000 --> 00:34:54.000]   ***
[00:34:55.000 --> 00:34:56.000]   ***
...
```

The expected behavior should be
```
[00:34:40.000 --> 00:34:47.000]   This is an example audio file.
[00:34:47.000 --> 00:34:49.000]   The text has been redacted
[00:34:49.000 --> 00:34:51.000]   This is the end of the audio.
- end of program -
```

This commit changes the calculation of the `seek_end` variable to
only add `seek_start` if a custom `duration_ms` is provided.
Otherwise, it defaults to the end of the file.
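
As an illustrative sketch of the corrected calculation (hypothetical names, not
the exact whisper.cpp code):
```cpp
// Illustrative only: seek positions are measured in 10 ms frames.
static int compute_seek_end(int offset_ms, int duration_ms, int n_frames_total) {
    const int seek_start = offset_ms / 10;
    return duration_ms == 0
        ? n_frames_total                 // default: transcribe to the end of the file
        : seek_start + duration_ms / 10; // add seek_start only for a custom duration
}
```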

Signed-off-by: Thijs Raymakers <thijs@raymakers.nl>
2023-04-29 18:55:37 +03:00
bab97c83d0 tests : add "threads" to run-tests.sh 2023-04-29 12:32:28 +03:00
3eaeb030ff extra : add sync-ggml.sh script 2023-04-29 12:32:28 +03:00
acec73ab6e ggml : sync latest ggml + llama.cpp updates (quantization) 2023-04-29 12:32:28 +03:00
5cc17418c7 whisper.android : add some tips (#816) 2023-04-29 11:00:20 +03:00
3efb81dec6 build : add WHISPER_COREML_ALLOW_FALLBACK to make / CMake (#812) 2023-04-29 10:55:24 +03:00
94a7cd2a07 whisper : allow non-CoreML fallback when Core ML cannot be loaded (#812)
If the Core ML model cannot be loaded, continue without Core ML instead of
returning. This allows a single build to transcribe using Core ML models
where available, and regular models when not.
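
A minimal sketch of the fallback pattern, assuming a compile-time flag like the
WHISPER_COREML_ALLOW_FALLBACK option added in this change (function and variable
names are hypothetical):
```cpp
#include <cstdio>

// Hypothetical stand-in for the real Core ML encoder loader.
static void * coreml_init(const char * /*path_coreml*/) { return nullptr; }

// Returns false only when Core ML fails *and* no fallback is allowed.
static bool init_encoder(void ** ctx_coreml, const char * path_coreml) {
    *ctx_coreml = coreml_init(path_coreml);
    if (*ctx_coreml == nullptr) {
        fprintf(stderr, "failed to load Core ML model from '%s'\n", path_coreml);
#ifndef WHISPER_COREML_ALLOW_FALLBACK
        return false; // previous behavior: abort initialization
#endif
        // fallback allowed: leave the Core ML context null and use the regular ggml encoder
    }
    return true;
}
```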
2023-04-29 10:49:02 +03:00
3e82ff4747 whisper : fix bug from previous commit 2023-04-29 10:42:14 +03:00
b5bd2f43c5 whisper : avoid designated initializers 2023-04-29 10:36:50 +03:00
94aa56f19e minor : improve C++ and Python style (#768)
* use some STL functions

* use self.field rather than setattr, use pathlib.Path

* recover some format

* const some iter

* Keep the original

* 2 space
2023-04-29 10:06:25 +03:00
4d89ee2e59 readme : add logo 2023-04-28 22:41:29 +03:00
70567eff23 main : escape quotes in csv output (#815) 2023-04-23 19:01:59 +03:00
02ec83c5d5 stream : flush upon finishing inference (#811) 2023-04-23 17:00:30 +03:00
2bd4b8d577 examples : add missing #include <cstdint> (#798)
common.cpp uses uint8_t and uint64_t, which are defined in <cstdint>.
2023-04-23 16:52:52 +03:00
eecf2c3d41 main : update escape_double_quotes() function (#776)
Updated the escape_double_quotes() function so that it now escapes both double quotes and backslashes in the input string.

Changes Made:

- Renamed the function to escape_double_quotes_and_backslashes

- Modified the condition in the first loop to increment the value of 'escaped_length' for both double quotes and backslashes.

- Modified the condition in the second loop to add a backslash before the current character if it is a double quote or a backslash (see the illustrative usage below).
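
Illustrative usage, with the function name as it appears in the main.cpp diff
further below (expected output shown in comments):
```cpp
//   input   : She said "hi" in C:\tmp
//   escaped : She said \"hi\" in C:\\tmp
char * escaped = escape_double_quotes_and_backslashes("She said \"hi\" in C:\\tmp");
printf("%s\n", escaped);
free(escaped);
```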

Resolves: #769
2023-04-23 16:47:30 +03:00
56 changed files with 7756 additions and 3115 deletions

.gitignore (vendored), 2 changed lines

@ -12,6 +12,7 @@ build-em/
build-debug/ build-debug/
build-release/ build-release/
build-static/ build-static/
build-cublas/
build-no-accel/ build-no-accel/
build-sanitize-addr/ build-sanitize-addr/
build-sanitize-thread/ build-sanitize-thread/
@ -22,6 +23,7 @@ build-sanitize-thread/
/talk /talk
/talk-llama /talk-llama
/bench /bench
/quantize
arm_neon.h arm_neon.h
sync.sh sync.sh


@ -1,6 +1,6 @@
cmake_minimum_required (VERSION 3.0) cmake_minimum_required (VERSION 3.0)
project(whisper.cpp VERSION 1.3.0) project(whisper.cpp VERSION 1.4.0)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")
add_compile_options(/utf-8) add_compile_options(/utf-8)
@ -51,7 +51,7 @@ option(WHISPER_SANITIZE_UNDEFINED "whisper: enable undefined sanitizer" OFF
option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE}) option(WHISPER_BUILD_TESTS "whisper: build tests" ${WHISPER_STANDALONE})
option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE}) option(WHISPER_BUILD_EXAMPLES "whisper: build examples" ${WHISPER_STANDALONE})
option(WHISPER_SUPPORT_SDL2 "whisper: support for libSDL2" OFF) option(WHISPER_SDL2 "whisper: support for libSDL2" OFF)
if (APPLE) if (APPLE)
option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF) option(WHISPER_NO_ACCELERATE "whisper: disable Accelerate framework" OFF)
@ -60,8 +60,10 @@ if (APPLE)
option(WHISPER_NO_FMA "whisper: disable FMA" OFF) option(WHISPER_NO_FMA "whisper: disable FMA" OFF)
option(WHISPER_COREML "whisper: enable Core ML framework" OFF) option(WHISPER_COREML "whisper: enable Core ML framework" OFF)
option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
else() else()
option(WHISPER_SUPPORT_OPENBLAS "whisper: support for OpenBLAS" OFF) option(WHISPER_OPENBLAS "whisper: support for OpenBLAS" OFF)
option(WHISPER_CUBLAS "whisper: support for cuBLAS" OFF)
endif() endif()
option(WHISPER_PERF "whisper: enable perf timings" OFF) option(WHISPER_PERF "whisper: enable perf timings" OFF)
@ -119,10 +121,14 @@ if (APPLE)
else() else()
message(WARNING "CoreML framework not found") message(WARNING "CoreML framework not found")
endif() endif()
if (WHISPER_COREML_ALLOW_FALLBACK)
set(WHISPER_EXTRA_FLAGS ${WHISPER_EXTRA_FLAGS} -DWHISPER_USE_COREML_ALLOW_FALLBACK)
endif()
endif() endif()
endif() endif()
if (WHISPER_SUPPORT_OPENBLAS) if (WHISPER_OPENBLAS)
find_library(OPENBLAS_LIB find_library(OPENBLAS_LIB
NAMES openblas libopenblas NAMES openblas libopenblas
) )
@ -136,6 +142,31 @@ if (WHISPER_SUPPORT_OPENBLAS)
endif() endif()
endif() endif()
if (WHISPER_CUBLAS)
cmake_minimum_required(VERSION 3.17)
find_package(CUDAToolkit)
if (CUDAToolkit_FOUND)
message(STATUS "cuBLAS found")
enable_language(CUDA)
set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
add_compile_definitions(GGML_USE_CUBLAS)
if (WHISPER_STATIC)
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
else()
set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
else()
message(WARNING "cuBLAS not found")
endif()
endif()
# compiler flags # compiler flags
if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@ -242,6 +273,7 @@ set(TARGET whisper)
add_library(${TARGET} add_library(${TARGET}
ggml.h ggml.h
ggml.c ggml.c
${GGML_CUDA_SOURCES}
whisper.h whisper.h
whisper.cpp whisper.cpp
) )
@ -271,7 +303,19 @@ if (BUILD_SHARED_LIBS)
target_compile_definitions(${TARGET} PUBLIC target_compile_definitions(${TARGET} PUBLIC
WHISPER_SHARED WHISPER_SHARED
GGML_SHARED
) )
target_compile_definitions(${TARGET} PRIVATE
WHISPER_BUILD
GGML_BUILD
)
endif()
if (GGML_CUDA_SOURCES)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
endif() endif()
if (EMSCRIPTEN) if (EMSCRIPTEN)


@ -1,3 +1,5 @@
default: main bench quantize
ifndef UNAME_S ifndef UNAME_S
UNAME_S := $(shell uname -s) UNAME_S := $(shell uname -s)
endif endif
@ -123,6 +125,7 @@ endif
ifeq ($(UNAME_M),amd64) ifeq ($(UNAME_M),amd64)
CFLAGS += -mavx -mavx2 -mfma -mf16c CFLAGS += -mavx -mavx2 -mfma -mf16c
endif endif
ifneq ($(filter ppc64%,$(UNAME_M)),) ifneq ($(filter ppc64%,$(UNAME_M)),)
POWER9_M := $(shell grep "POWER9" /proc/cpuinfo) POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
ifneq (,$(findstring POWER9,$(POWER9_M))) ifneq (,$(findstring POWER9,$(POWER9_M)))
@ -133,6 +136,7 @@ ifneq ($(filter ppc64%,$(UNAME_M)),)
CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
endif endif
endif endif
ifndef WHISPER_NO_ACCELERATE ifndef WHISPER_NO_ACCELERATE
# Mac M1 - include Accelerate framework # Mac M1 - include Accelerate framework
ifeq ($(UNAME_S),Darwin) ifeq ($(UNAME_S),Darwin)
@ -140,26 +144,48 @@ ifndef WHISPER_NO_ACCELERATE
LDFLAGS += -framework Accelerate LDFLAGS += -framework Accelerate
endif endif
endif endif
ifdef WHISPER_COREML ifdef WHISPER_COREML
CXXFLAGS += -DWHISPER_USE_COREML CXXFLAGS += -DWHISPER_USE_COREML
LDFLAGS += -framework Foundation -framework CoreML LDFLAGS += -framework Foundation -framework CoreML
ifdef WHISPER_COREML_ALLOW_FALLBACK
CXXFLAGS += -DWHISPER_COREML_ALLOW_FALLBACK
endif endif
endif
ifdef WHISPER_OPENBLAS ifdef WHISPER_OPENBLAS
CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas CFLAGS += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
LDFLAGS += -lopenblas LDFLAGS += -lopenblas
endif endif
ifdef WHISPER_CUBLAS
CFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
LDFLAGS += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
WHISPER_OBJ += ggml-cuda.o
NVCC = nvcc
NVCCFLAGS = --forward-unknown-to-host-compiler -arch=native
ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
$(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
endif
ifdef WHISPER_GPROF ifdef WHISPER_GPROF
CFLAGS += -pg CFLAGS += -pg
CXXFLAGS += -pg CXXFLAGS += -pg
endif endif
ifneq ($(filter aarch64%,$(UNAME_M)),) ifneq ($(filter aarch64%,$(UNAME_M)),)
CFLAGS += -mcpu=native CFLAGS += -mcpu=native
CXXFLAGS += -mcpu=native CXXFLAGS += -mcpu=native
endif endif
ifneq ($(filter armv6%,$(UNAME_M)),) ifneq ($(filter armv6%,$(UNAME_M)),)
# 32-bit Raspberry Pi 1, 2, 3 # 32-bit Raspberry Pi 1, 2, 3
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access
endif endif
ifneq ($(filter armv7%,$(UNAME_M)),) ifneq ($(filter armv7%,$(UNAME_M)),)
# 32-bit ARM, for example on Armbian or possibly raspbian # 32-bit ARM, for example on Armbian or possibly raspbian
CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations CFLAGS += -mfpu=neon -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
@ -167,6 +193,7 @@ ifneq ($(filter armv7%,$(UNAME_M)),)
# 64-bit ARM, use these (TODO: auto-detect 64-bit) # 64-bit ARM, use these (TODO: auto-detect 64-bit)
# CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations # CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
endif endif
ifneq ($(filter armv8%,$(UNAME_M)),) ifneq ($(filter armv8%,$(UNAME_M)),)
# Raspberry Pi 4 # Raspberry Pi 4
CFLAGS += -mfp16-format=ieee -mno-unaligned-access CFLAGS += -mfp16-format=ieee -mno-unaligned-access
@ -187,20 +214,18 @@ $(info I CC: $(CCV))
$(info I CXX: $(CXXV)) $(info I CXX: $(CXXV))
$(info ) $(info )
default: main bench
# #
# Build library # Build library
# #
ggml.o: ggml.c ggml.h ggml.o: ggml.c ggml.h ggml-cuda.h
$(CC) $(CFLAGS) -c ggml.c -o ggml.o $(CC) $(CFLAGS) -c $< -o $@
whisper.o: whisper.cpp whisper.h ggml.h whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
$(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o $(CXX) $(CXXFLAGS) -c $< -o $@
ifndef WHISPER_COREML ifndef WHISPER_COREML
WHISPER_OBJ = whisper.o WHISPER_OBJ += whisper.o
else else
whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
$(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o $(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
@ -208,7 +233,7 @@ whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
$(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o $(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
endif endif
libwhisper.a: ggml.o $(WHISPER_OBJ) libwhisper.a: ggml.o $(WHISPER_OBJ)
@ -218,7 +243,7 @@ libwhisper.so: ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS) $(CXX) $(CXXFLAGS) -shared -o libwhisper.so ggml.o $(WHISPER_OBJ) $(LDFLAGS)
clean: clean:
rm -f *.o main stream command talk talk-llama bench libwhisper.a libwhisper.so rm -f *.o main stream command talk talk-llama bench quantize libwhisper.a libwhisper.so
# #
# Examples # Examples
@ -226,7 +251,7 @@ clean:
CC_SDL=`sdl2-config --cflags --libs` CC_SDL=`sdl2-config --cflags --libs`
SRC_COMMON = examples/common.cpp SRC_COMMON = examples/common.cpp examples/common-ggml.cpp
SRC_COMMON_SDL = examples/common-sdl.cpp SRC_COMMON_SDL = examples/common-sdl.cpp
main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
@ -236,6 +261,9 @@ main: examples/main/main.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ)
bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) bench: examples/bench/bench.cpp ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS) $(CXX) $(CXXFLAGS) examples/bench/bench.cpp ggml.o $(WHISPER_OBJ) -o bench $(LDFLAGS)
quantize: examples/quantize/quantize.cpp ggml.o $(WHISPER_OBJ) $(SRC_COMMON)
$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp $(SRC_COMMON) ggml.o $(WHISPER_OBJ) -o quantize $(LDFLAGS)
stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) stream: examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ)
$(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS) $(CXX) $(CXXFLAGS) examples/stream/stream.cpp $(SRC_COMMON) $(SRC_COMMON_SDL) ggml.o $(WHISPER_OBJ) -o stream $(CC_SDL) $(LDFLAGS)


@ -1,10 +1,12 @@
# whisper.cpp # whisper.cpp
![whisper.cpp](https://user-images.githubusercontent.com/1991296/235238348-05d0f6a4-da44-4900-a1de-d0707e75b763.jpeg)
[![Actions Status](https://github.com/ggerganov/whisper.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/whisper.cpp/actions) [![Actions Status](https://github.com/ggerganov/whisper.cpp/workflows/CI/badge.svg)](https://github.com/ggerganov/whisper.cpp/actions)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/) [![npm](https://img.shields.io/npm/v/whisper.cpp.svg)](https://www.npmjs.com/package/whisper.cpp/)
Beta: [v1.3.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.3.0) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126) Beta: [v1.4.0](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.4.0) / Stable: [v1.2.1](https://github.com/ggerganov/whisper.cpp/releases/tag/v1.2.1) / [Roadmap | F.A.Q.](https://github.com/ggerganov/whisper.cpp/discussions/126)
High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model: High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisper) automatic speech recognition (ASR) model:
@ -13,9 +15,11 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
- AVX intrinsics support for x86 architectures - AVX intrinsics support for x86 architectures
- VSX intrinsics support for POWER architectures - VSX intrinsics support for POWER architectures
- Mixed F16 / F32 precision - Mixed F16 / F32 precision
- [4-bit and 5-bit integer quantization support](https://github.com/ggerganov/whisper.cpp#quantization)
- Low memory usage (Flash Attention) - Low memory usage (Flash Attention)
- Zero memory allocations at runtime - Zero memory allocations at runtime
- Runs on the CPU - Runs on the CPU
- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
- [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h) - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
Supported platforms: Supported platforms:
@ -225,6 +229,22 @@ make large
| medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` | | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
| large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` | | large | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
## Quantization
`whisper.cpp` supports integer quantization of the Whisper `ggml` models.
Quantized models require less memory and disk space and, depending on the hardware, can be processed more efficiently.
Here are the steps for creating and using a quantized model:
```bash
# quantize a model with Q5_0 method
make quantize
./quantize models/ggml-base.en.bin models/ggml-base.en-q5_0.bin q5_0
# run the examples as usual, specifying the quantized model file
./main -m models/ggml-base.en-q5_0.bin ./samples/gb0.wav
```
## Core ML support ## Core ML support
On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant On Apple Silicon devices, the Encoder inference can be executed on the Apple Neural Engine (ANE) via Core ML. This can result in significant
@ -279,10 +299,23 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566). For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
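
As a hedged example, a Core ML build that tolerates a missing Core ML model can
be produced with the flags introduced in the Makefile / CMakeLists.txt changes of
this compare (illustrative invocation, not taken verbatim from the README):
```bash
# build with Core ML support and allow falling back to the regular encoder
make clean
WHISPER_COREML=1 WHISPER_COREML_ALLOW_FALLBACK=1 make -j

# or, with CMake
cmake -B build -DWHISPER_COREML=1 -DWHISPER_COREML_ALLOW_FALLBACK=1
cmake --build build -j
```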
## NVIDIA GPU support via cuBLAS
With NVIDIA cards, the Encoder processing can be offloaded to the GPU to a large extent through cuBLAS.
First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
Now build `whisper.cpp` with cuBLAS support:
```
make clean
WHISPER_CUBLAS=1 make -j
```
Run all the examples as usual.
## Limitations ## Limitations
- Inference only - Inference only
- No GPU support (yet)
## Another example ## Another example


@ -1,6 +1,6 @@
{ {
"name": "whisper.cpp", "name": "whisper.cpp",
"version": "1.3.0", "version": "1.4.0",
"description": "Whisper speech recognition", "description": "Whisper speech recognition",
"main": "whisper.js", "main": "whisper.js",
"scripts": { "scripts": {

File diff suppressed because one or more lines are too long


@ -4,7 +4,7 @@ find_package(Threads REQUIRED)
# third-party # third-party
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# SDL2 # SDL2
find_package(SDL2 REQUIRED) find_package(SDL2 REQUIRED)
@ -21,13 +21,17 @@ set(TARGET common)
add_library(${TARGET} STATIC add_library(${TARGET} STATIC
common.h common.h
common.cpp common.cpp
common-ggml.h
common-ggml.cpp
) )
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE whisper)
set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# common-sdl # common-sdl
set(TARGET common-sdl) set(TARGET common-sdl)
@ -62,6 +66,7 @@ else()
add_subdirectory(stream) add_subdirectory(stream)
add_subdirectory(command) add_subdirectory(command)
add_subdirectory(bench) add_subdirectory(bench)
add_subdirectory(quantize)
add_subdirectory(talk) add_subdirectory(talk)
add_subdirectory(talk-llama) add_subdirectory(talk-llama)
endif() endif()


@ -18,5 +18,6 @@ describe("Run whisper.node", () => {
let result = await whisperAsync(whisperParamsMock); let result = await whisperAsync(whisperParamsMock);
expect(result.length).toBeGreaterThan(0); expect(result.length).toBeGreaterThan(0);
}, 10000);
}); });
});


@ -31,9 +31,9 @@ endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \ --bind \
-s USE_PTHREADS=1 \ -s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \ -s PTHREAD_POOL_SIZE_STRICT=0 \
-s INITIAL_MEMORY=1024MB \ -s INITIAL_MEMORY=2000MB \
-s TOTAL_MEMORY=1024MB \ -s TOTAL_MEMORY=2000MB \
-s FORCE_FILESYSTEM=1 \ -s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \ -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \ ${EXTRA_FLAGS} \


@ -35,6 +35,15 @@
<br><br> <br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr> <hr>
Select the model you would like to use and click the "Bench" button.<br> Select the model you would like to use and click the "Bench" button.<br>
@ -46,9 +55,16 @@
Whisper model: <span id="model-whisper-status"></span> Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<span id="fetch-whisper-progress"></span> <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" /> <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
<span id="fetch-whisper-progress"></span>
</div> </div>
<br> <br>
@ -160,6 +176,14 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none'; document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name; document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
} }
@ -168,11 +192,25 @@
let urls = { let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
}; };
let sizes = { let sizes = {
'tiny.en': 75, 'tiny.en': 75,
'base.en': 142, 'base.en': 142,
'small.en': 466,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
'small-en-q5_1': 182,
'medium-en-q5_0': 515,
'large-q5_0': 1030,
}; };
let url = urls[model]; let url = urls[model];
@ -181,6 +219,15 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-small-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... '; document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) { cbProgress = function(p) {
@ -192,6 +239,15 @@
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };


@ -28,31 +28,6 @@ std::string g_transcribed = "";
std::vector<float> g_pcmf32; std::vector<float> g_pcmf32;
// compute similarity between two strings using Levenshtein distance
static float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
void command_set_status(const std::string & status) { void command_set_status(const std::string & status) {
std::lock_guard<std::mutex> lock(g_mutex); std::lock_guard<std::mutex> lock(g_mutex);
g_status = status; g_status = status;


@ -35,6 +35,15 @@
<br><br> <br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr> <hr>
Select the model you would like to use, click the "Start" button and follow the instructions. Select the model you would like to use, click the "Start" button and follow the instructions.
@ -45,6 +54,10 @@
Whisper model: <span id="model-whisper-status"></span> Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span> <span id="fetch-whisper-progress"></span>
<!-- <!--
@ -162,11 +175,17 @@
let urls = { let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
}; };
let sizes = { let sizes = {
'tiny.en': 75, 'tiny.en': 75,
'base.en': 142, 'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
}; };
let url = urls[model]; let url = urls[model];
@ -177,6 +196,10 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... '; document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) { cbProgress = function(p) {
@ -188,6 +211,10 @@
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };


@ -1,4 +1,4 @@
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# command # command
set(TARGET command) set(TARGET command)
add_executable(${TARGET} command.cpp) add_executable(${TARGET} command.cpp)


@ -163,31 +163,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
return result; return result;
} }
// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}
std::vector<std::string> read_allowed_commands(const std::string & fname) { std::vector<std::string> read_allowed_commands(const std::string & fname) {
std::vector<std::string> allowed_commands; std::vector<std::string> allowed_commands;

examples/common-ggml.cpp (new file, 241 lines)

@ -0,0 +1,241 @@
#include "common-ggml.h"
#include <regex>
#include <map>
static const std::map<std::string, enum ggml_ftype> GGML_FTYPE_MAP = {
{"q4_0", GGML_FTYPE_MOSTLY_Q4_0},
{"q4_1", GGML_FTYPE_MOSTLY_Q4_1},
{"q4_2", GGML_FTYPE_MOSTLY_Q4_2},
{"q5_0", GGML_FTYPE_MOSTLY_Q5_0},
{"q5_1", GGML_FTYPE_MOSTLY_Q5_1},
{"q8_0", GGML_FTYPE_MOSTLY_Q8_0},
};
void ggml_print_ftypes(FILE * fp) {
for (auto it = GGML_FTYPE_MAP.begin(); it != GGML_FTYPE_MAP.end(); it++) {
fprintf(fp, " type = \"%s\" or %d\n", it->first.c_str(), it->second);
}
}
enum ggml_ftype ggml_parse_ftype(const char * str) {
enum ggml_ftype ftype;
if (str[0] == 'q') {
const auto it = GGML_FTYPE_MAP.find(str);
if (it == GGML_FTYPE_MAP.end()) {
fprintf(stderr, "%s: unknown ftype '%s'\n", __func__, str);
return GGML_FTYPE_UNKNOWN;
}
ftype = it->second;
} else {
ftype = (enum ggml_ftype) atoi(str);
}
return ftype;
}
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip) {
ggml_type qtype = GGML_TYPE_F32;
switch (ftype) {
case GGML_FTYPE_MOSTLY_Q4_0: qtype = GGML_TYPE_Q4_0; break;
case GGML_FTYPE_MOSTLY_Q4_1: qtype = GGML_TYPE_Q4_1; break;
case GGML_FTYPE_MOSTLY_Q4_2: qtype = GGML_TYPE_Q4_2; break;
case GGML_FTYPE_MOSTLY_Q5_0: qtype = GGML_TYPE_Q5_0; break;
case GGML_FTYPE_MOSTLY_Q5_1: qtype = GGML_TYPE_Q5_1; break;
case GGML_FTYPE_MOSTLY_Q8_0: qtype = GGML_TYPE_Q8_0; break;
case GGML_FTYPE_UNKNOWN:
case GGML_FTYPE_ALL_F32:
case GGML_FTYPE_MOSTLY_F16:
case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16:
{
fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype);
return false;
}
};
if (!ggml_is_quantized(qtype)) {
fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
return false;
}
size_t total_size_org = 0;
size_t total_size_new = 0;
std::vector<float> work;
std::vector<uint8_t> data_u8;
std::vector<ggml_fp16_t> data_f16;
std::vector<float> data_f32;
std::vector<int64_t> hist_all(1 << 4, 0);
while (true) {
int32_t n_dims;
int32_t length;
int32_t ttype;
finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
finp.read(reinterpret_cast<char *>(&length), sizeof(length));
finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (finp.eof()) {
break;
}
int32_t nelements = 1;
int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
std::string name(length, 0);
finp.read (&name[0], length);
printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype));
bool quantize = false;
// check if we should quantize this tensor
for (const auto & s : to_quant) {
if (std::regex_match(name, std::regex(s))) {
quantize = true;
break;
}
}
// check if we should skip this tensor
for (const auto & s : to_skip) {
if (std::regex_match(name, std::regex(s))) {
quantize = false;
break;
}
}
// quantize only 2D tensors
quantize &= (n_dims == 2);
if (quantize) {
if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
return false;
}
if (ttype == GGML_TYPE_F16) {
data_f16.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
data_f32.resize(nelements);
for (int i = 0; i < nelements; ++i) {
data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
}
} else {
data_f32.resize(nelements);
finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
}
ttype = qtype;
} else {
const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
data_u8.resize(nelements*bpe);
finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
}
fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fout.write(reinterpret_cast<char *>(&length), sizeof(length));
fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
for (int i = 0; i < n_dims; ++i) {
fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
}
fout.write(&name[0], length);
if (quantize) {
work.resize(nelements); // for quantization
size_t cur_size = 0;
std::vector<int64_t> hist_cur(1 << 4, 0);
switch ((ggml_type) ttype) {
case GGML_TYPE_Q4_0:
{
cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_1:
{
cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q4_2:
{
cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_0:
{
cur_size = ggml_quantize_q5_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q5_1:
{
cur_size = ggml_quantize_q5_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_Q8_0:
{
cur_size = ggml_quantize_q8_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
} break;
case GGML_TYPE_F32:
case GGML_TYPE_F16:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_Q8_1:
case GGML_TYPE_COUNT:
{
fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
return false;
}
}
fout.write(reinterpret_cast<char *>(work.data()), cur_size);
total_size_new += cur_size;
printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
for (int i = 0; i < hist_cur.size(); ++i) {
hist_all[i] += hist_cur[i];
}
for (int i = 0; i < hist_cur.size(); ++i) {
printf("%5.3f ", hist_cur[i] / (float)nelements);
}
printf("\n");
} else {
printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
total_size_new += data_u8.size();
}
total_size_org += nelements * sizeof(float);
}
printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
printf("%s: quant size = %8.2f MB | ftype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, ftype, ggml_type_name(qtype));
{
int64_t sum_all = 0;
for (int i = 0; i < hist_all.size(); ++i) {
sum_all += hist_all[i];
}
printf("%s: hist: ", __func__);
for (int i = 0; i < hist_all.size(); ++i) {
printf("%5.3f ", hist_all[i] / (float)sum_all);
}
printf("\n");
}
return true;
}

examples/common-ggml.h (new file, 18 lines)

@ -0,0 +1,18 @@
#pragma once
#include "ggml.h"
#include <fstream>
#include <vector>
#include <string>
enum ggml_ftype ggml_parse_ftype(const char * str);
void ggml_print_ftypes(FILE * fp = stderr);
bool ggml_common_quantize_0(
std::ifstream & finp,
std::ofstream & fout,
const ggml_ftype ftype,
const std::vector<std::string> & to_quant,
const std::vector<std::string> & to_skip);


@ -6,12 +6,86 @@
#include "dr_wav.h" #include "dr_wav.h"
#include <cmath> #include <cmath>
#include <fstream>
#include <regex> #include <regex>
#ifndef M_PI #ifndef M_PI
#define M_PI 3.14159265358979323846 #define M_PI 3.14159265358979323846
#endif #endif
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
for (int i = 1; i < argc; i++) {
std::string arg = argv[i];
if (arg == "-s" || arg == "--seed") {
params.seed = std::stoi(argv[++i]);
} else if (arg == "-t" || arg == "--threads") {
params.n_threads = std::stoi(argv[++i]);
} else if (arg == "-p" || arg == "--prompt") {
params.prompt = argv[++i];
} else if (arg == "-n" || arg == "--n_predict") {
params.n_predict = std::stoi(argv[++i]);
} else if (arg == "--top_k") {
params.top_k = std::stoi(argv[++i]);
} else if (arg == "--top_p") {
params.top_p = std::stof(argv[++i]);
} else if (arg == "--temp") {
params.temp = std::stof(argv[++i]);
} else if (arg == "-b" || arg == "--batch_size") {
params.n_batch = std::stoi(argv[++i]);
} else if (arg == "-m" || arg == "--model") {
params.model = argv[++i];
} else if (arg == "-h" || arg == "--help") {
gpt_print_usage(argc, argv, params);
exit(0);
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
gpt_print_usage(argc, argv, params);
exit(0);
}
}
return true;
}
void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, "usage: %s [options]\n", argv[0]);
fprintf(stderr, "\n");
fprintf(stderr, "options:\n");
fprintf(stderr, " -h, --help show this help message and exit\n");
fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1)\n");
fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
fprintf(stderr, " -p PROMPT, --prompt PROMPT\n");
fprintf(stderr, " prompt to start generation with (default: random)\n");
fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d)\n", params.n_predict);
fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k);
fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p);
fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp);
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
}
std::string gpt_random_prompt(std::mt19937 & rng) {
const int r = rng() % 10;
switch (r) {
case 0: return "So";
case 1: return "Once upon a time";
case 2: return "When";
case 3: return "The";
case 4: return "After";
case 5: return "If";
case 6: return "import";
case 7: return "He";
case 8: return "She";
case 9: return "They";
default: return "To";
}
return "The";
}
std::string trim(const std::string & s) { std::string trim(const std::string & s) {
std::regex e("^\\s+|\\s+$"); std::regex e("^\\s+|\\s+$");
return std::regex_replace(s, e, ""); return std::regex_replace(s, e, "");
@ -27,6 +101,251 @@ std::string replace(const std::string & s, const std::string & from, const std::
return result; return result;
} }
std::map<std::string, int32_t> json_parse(const std::string & fname) {
std::map<std::string, int32_t> result;
// read file into string
std::string json;
{
std::ifstream ifs(fname);
if (!ifs) {
fprintf(stderr, "Failed to open %s\n", fname.c_str());
exit(1);
}
json = std::string((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
}
if (json[0] != '{') {
return result;
}
// parse json
{
bool has_key = false;
bool in_token = false;
std::string str_key = "";
std::string str_val = "";
int n = json.size();
for (int i = 1; i < n; ++i) {
if (!in_token) {
if (json[i] == ' ') continue;
if (json[i] == '"') {
in_token = true;
continue;
}
} else {
if (json[i] == '\\' && i+1 < n) {
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
++i;
} else if (json[i] == '"') {
if (has_key == false) {
has_key = true;
++i;
while (json[i] == ' ') ++i;
++i; // :
while (json[i] == ' ') ++i;
if (json[i] != '\"') {
while (json[i] != ',' && json[i] != '}') {
str_val += json[i++];
}
has_key = false;
} else {
in_token = true;
continue;
}
} else {
has_key = false;
}
str_key = ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
str_key = ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
str_key = ::replace(str_key, "\\\"", "\""); // \\\" -> "
try {
result[str_key] = std::stoi(str_val);
} catch (...) {
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
}
str_key = "";
str_val = "";
in_token = false;
continue;
}
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
}
}
}
return result;
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.size() == 0) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
vocab.token_to_id = ::json_parse(fname);
for (const auto & kv : vocab.token_to_id) {
vocab.id_to_token[kv.second] = kv.first;
}
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
// print the vocabulary
//for (auto kv : vocab.token_to_id) {
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
//}
return true;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const double scale = 1.0/temp;
for (int i = 0; i < n_logits; ++i) {
logits_id.push_back(std::make_pair(logits[i]*scale, i));
}
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
top_k = i + 1;
probs.resize(top_k);
logits_id.resize(top_k);
break;
}
}
cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
}
}
//printf("\n");
//for (int i = 0; i < (int) probs.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//}
//exit(0);
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) { bool read_wav(const std::string & fname, std::vector<float>& pcmf32, std::vector<std::vector<float>>& pcmf32s, bool stereo) {
drwav wav; drwav wav;
std::vector<uint8_t> wav_data; // used for pipe input from stdin std::vector<uint8_t> wav_data; // used for pipe input from stdin
@ -160,3 +479,27 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
return true; return true;
} }
float similarity(const std::string & s0, const std::string & s1) {
const size_t len0 = s0.size() + 1;
const size_t len1 = s1.size() + 1;
std::vector<int> col(len1, 0);
std::vector<int> prevCol(len1, 0);
for (size_t i = 0; i < len1; i++) {
prevCol[i] = i;
}
for (size_t i = 0; i < len0; i++) {
col[0] = i;
for (size_t j = 1; j < len1; j++) {
col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
}
col.swap(prevCol);
}
const float dist = prevCol[len1 - 1];
return 1.0f - (dist / std::max(s0.size(), s1.size()));
}


@ -1,10 +1,44 @@
// Various helper functions and utilities
#pragma once #pragma once
// needs to match WHISPER_SAMPLE_RATE #include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>
#define COMMON_SAMPLE_RATE 16000 #define COMMON_SAMPLE_RATE 16000
#include <vector> //
#include <string> // CLI argument parsing
//
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 200; // new tokens to predict
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 0.9f;
int32_t n_batch = 8; // batch size for prompt processing
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
std::string prompt;
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
std::string trim(const std::string & s); std::string trim(const std::string & s);
@ -13,6 +47,52 @@ std::string replace(
const std::string & from, const std::string & from,
const std::string & to); const std::string & to);
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};
// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);
// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
// sample next token given probabilities for each embedding
//
// - consider only the top K tokens
// - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
// TODO: temperature is not implemented
//
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng);
//
// Audio utils
//
// Read WAV audio file and store the PCM data into pcmf32 // Read WAV audio file and store the PCM data into pcmf32
// The sample rate of the audio must be equal to COMMON_SAMPLE_RATE // The sample rate of the audio must be equal to COMMON_SAMPLE_RATE
// If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM // If stereo flag is set and the audio has 2 channels, the pcmf32s will contain 2 channel PCM
@ -38,3 +118,5 @@ bool vad_simple(
float freq_thold, float freq_thold,
bool verbose); bool verbose);
// compute similarity between two strings using Levenshtein distance
float similarity(const std::string & s0, const std::string & s1);


@ -145,7 +145,15 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
var db = event.target.result; var db = event.target.result;
var tx = db.transaction(['models'], 'readwrite'); var tx = db.transaction(['models'], 'readwrite');
var os = tx.objectStore('models'); var os = tx.objectStore('models');
var rq = null;
try {
var rq = os.put(data, url); var rq = os.put(data, url);
} catch (e) {
cbPrint('loadRemote: failed to store "' + url + '" in the IndexedDB: \n' + e);
cbCancel();
return;
}
rq.onsuccess = function (event) { rq.onsuccess = function (event) {
cbPrint('loadRemote: "' + url + '" stored in the IndexedDB'); cbPrint('loadRemote: "' + url + '" stored in the IndexedDB');
@ -180,7 +188,6 @@ function loadRemote(url, dst, size_mb, cbProgress, cbReady, cbCancel, cbPrint) {
rq.onabort = function (event) { rq.onabort = function (event) {
cbPrint('loadRemote: failed to open IndexedDB: abort'); cbPrint('loadRemote: failed to open IndexedDB: abort');
cbCancel();
}; };
} }


@ -352,6 +352,37 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_
return true; return true;
} }
char *escape_double_quotes_and_backslashes(const char *str) {
if (str == NULL) {
return NULL;
}
size_t escaped_length = strlen(str) + 1;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"' || str[i] == '\\') {
escaped_length++;
}
}
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) {
return NULL;
}
size_t pos = 0;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"' || str[i] == '\\') {
escaped[pos++] = '\\';
}
escaped[pos++] = str[i];
}
// no need to set zero due to calloc() being used prior
return escaped;
}
bool output_csv(struct whisper_context * ctx, const char * fname) { bool output_csv(struct whisper_context * ctx, const char * fname) {
std::ofstream fout(fname); std::ofstream fout(fname);
if (!fout.is_open()) { if (!fout.is_open()) {
@ -367,47 +398,15 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
const char * text = whisper_full_get_segment_text(ctx, i); const char * text = whisper_full_get_segment_text(ctx, i);
const int64_t t0 = whisper_full_get_segment_t0(ctx, i); const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
const int64_t t1 = whisper_full_get_segment_t1(ctx, i); const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
char * text_escaped = escape_double_quotes_and_backslashes(text);
//need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds. //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds.
fout << 10 * t0 << "," << 10 * t1 << ",\"" << text << "\"\n"; fout << 10 * t0 << "," << 10 * t1 << ",\"" << text_escaped << "\"\n";
} }
return true; return true;
} }
char *escape_double_quotes(const char *str) {
if (str == NULL) {
return NULL;
}
size_t escaped_length = strlen(str) + 1;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped_length++;
}
}
char *escaped = (char *)calloc(escaped_length, 1); // pre-zeroed
if (escaped == NULL) {
return NULL;
}
size_t pos = 0;
for (size_t i = 0; str[i] != '\0'; i++) {
if (str[i] == '"') {
escaped[pos++] = '\\';
escaped[pos++] = '"';
} else {
escaped[pos++] = str[i];
}
}
// no need to set zero due to calloc() being used prior
return escaped;
}
bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) { bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
std::ofstream fout(fname); std::ofstream fout(fname);
int indent = 0; int indent = 0;
@ -451,7 +450,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
auto value_s = [&](const char *name, const char *val, bool end = false) { auto value_s = [&](const char *name, const char *val, bool end = false) {
start_value(name); start_value(name);
char * val_escaped = escape_double_quotes(val); char * val_escaped = escape_double_quotes_and_backslashes(val);
fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n"); fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n");
free(val_escaped); free(val_escaped);
}; };
@ -497,7 +496,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper
value_i("layer", whisper_model_n_text_layer(ctx), true); value_i("layer", whisper_model_n_text_layer(ctx), true);
end_obj(); end_obj();
value_i("mels", whisper_model_n_mels(ctx)); value_i("mels", whisper_model_n_mels(ctx));
value_i("f16", whisper_model_f16(ctx), true); value_i("ftype", whisper_model_ftype(ctx), true);
end_obj(); end_obj();
start_obj("params"); start_obj("params");
value_s("model", params.model.c_str()); value_s("model", params.model.c_str());

View File

@ -0,0 +1,6 @@
set(TARGET quantize)
add_executable(${TARGET} quantize.cpp)
include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE common whisper ${CMAKE_THREAD_LIBS_INIT})

View File

@ -0,0 +1,3 @@
# quantize
Tool for integer quantization of Whisper `ggml` model files

View File

@ -0,0 +1,215 @@
#include "ggml.h"
#include "common.h"
#include "common-ggml.h"
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <regex>
// default hparams (Whisper tiny)
struct whisper_hparams {
int32_t n_vocab = 51864;
int32_t n_audio_ctx = 1500;
int32_t n_audio_state = 384;
int32_t n_audio_head = 6;
int32_t n_audio_layer = 4;
int32_t n_text_ctx = 448;
int32_t n_text_state = 384;
int32_t n_text_head = 6;
int32_t n_text_layer = 4;
int32_t n_mels = 80;
int32_t f16 = 1;
};
struct whisper_filters {
int32_t n_mel;
int32_t n_fft;
std::vector<float> data;
};
// quantize a model
bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_ftype ftype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
auto finp = std::ifstream(fname_inp, std::ios::binary);
if (!finp) {
fprintf(stderr, "%s: failed to open '%s' for reading\n", __func__, fname_inp.c_str());
return false;
}
auto fout = std::ofstream(fname_out, std::ios::binary);
if (!fout) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname_out.c_str());
return false;
}
// verify magic
{
uint32_t magic;
finp.read((char *) &magic, sizeof(magic));
if (magic != 0x67676d6c) {
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname_inp.c_str());
return false;
}
fout.write((char *) &magic, sizeof(magic));
}
whisper_hparams hparams;
// load hparams
{
finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
finp.read((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
finp.read((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
finp.read((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
finp.read((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
finp.read((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
finp.read((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
finp.read((char *) &hparams.f16, sizeof(hparams.f16));
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
fout.write((char *) &hparams.n_audio_state, sizeof(hparams.n_audio_state));
fout.write((char *) &hparams.n_audio_head, sizeof(hparams.n_audio_head));
fout.write((char *) &hparams.n_audio_layer, sizeof(hparams.n_audio_layer));
fout.write((char *) &hparams.n_text_ctx, sizeof(hparams.n_text_ctx));
fout.write((char *) &hparams.n_text_state, sizeof(hparams.n_text_state));
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
fout.write((char *) &ftype, sizeof(hparams.f16));
}
// load mel filters
{
whisper_filters filters;
finp.read ((char *) &filters.n_mel, sizeof(filters.n_mel));
fout.write((char *) &filters.n_mel, sizeof(filters.n_mel));
finp.read ((char *) &filters.n_fft, sizeof(filters.n_fft));
fout.write((char *) &filters.n_fft, sizeof(filters.n_fft));
filters.data.resize(filters.n_mel * filters.n_fft);
finp.read ((char *) filters.data.data(), filters.data.size() * sizeof(float));
fout.write((char *) filters.data.data(), filters.data.size() * sizeof(float));
}
// load vocab
{
int32_t n_vocab = 0;
finp.read ((char *) &n_vocab, sizeof(n_vocab));
fout.write((char *) &n_vocab, sizeof(n_vocab));
//if (n_vocab != hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname_inp.c_str(), n_vocab, hparams.n_vocab);
// return false;
//}
std::string word;
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
finp.read ((char *) &len, sizeof(len));
fout.write((char *) &len, sizeof(len));
word.resize(len);
finp.read ((char *) word.data(), len);
fout.write((char *) word.data(), len);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
// regexes of tensor names to not be quantized
const std::vector<std::string> to_skip = {
//"encoder.*",
"encoder.conv1.bias",
"encoder.conv2.bias",
"encoder.positional_embedding",
"decoder.positional_embedding",
};
if (!ggml_common_quantize_0(finp, fout, ftype, { ".*" }, to_skip)) {
fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
return false;
}
finp.close();
fout.close();
return true;
}
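The `to_skip` list above holds regular expressions for tensor names that should stay unquantized; the actual matching happens inside `ggml_common_quantize_0()`, which is not shown in this diff. A rough sketch of how such name filtering could work (an assumption, not the library's implementation):

```cpp
// Sketch (assumption): applying a "quantize these" / "skip these" pair of regex lists to a tensor name.
#include <regex>
#include <string>
#include <vector>

static bool tensor_matches(const std::vector<std::string> & patterns, const std::string & name) {
    for (const auto & pat : patterns) {
        if (std::regex_match(name, std::regex(pat))) {
            return true;
        }
    }
    return false;
}

// quantize a tensor only if it matches the to-quantize patterns and none of the skip patterns
static bool should_quantize(const std::string & name,
                            const std::vector<std::string> & to_quant,
                            const std::vector<std::string> & to_skip) {
    return tensor_matches(to_quant, name) && !tensor_matches(to_skip, name);
}
```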
int main(int argc, char ** argv) {
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
ggml_print_ftypes(stderr);
return 1;
}
// needed to initialize f16 tables
{
struct ggml_init_params params = { 0, NULL, false };
struct ggml_context * ctx = ggml_init(params);
ggml_free(ctx);
}
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
const ggml_ftype ftype = ggml_parse_ftype(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
int64_t t_quantize_us = 0;
// load the model
{
const int64_t t_start_us = ggml_time_us();
if (!whisper_model_quantize(fname_inp, fname_out, ggml_ftype(ftype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
t_quantize_us = ggml_time_us() - t_start_us;
}
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
printf("\n");
printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}
return 0;
}

View File

@ -35,6 +35,15 @@
<br><br> <br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr> <hr>
Select the model you would like to use, click the "Start" button and start speaking Select the model you would like to use, click the "Start" button and start speaking
@ -45,6 +54,10 @@
Whisper model: <span id="model-whisper-status"></span> Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span> <span id="fetch-whisper-progress"></span>
<!-- <!--
@ -162,11 +175,17 @@
let urls = { let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
}; };
let sizes = { let sizes = {
'tiny.en': 75, 'tiny.en': 75,
'base.en': 142, 'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
}; };
let url = urls[model]; let url = urls[model];
@ -177,6 +196,10 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... '; document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) { cbProgress = function(p) {
@ -188,6 +211,10 @@
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };

View File

@ -1,4 +1,4 @@
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# stream # stream
set(TARGET stream) set(TARGET stream)
add_executable(${TARGET} stream.cpp) add_executable(${TARGET} stream.cpp)

View File

@ -383,6 +383,7 @@ int main(int argc, char ** argv) {
} }
} }
} }
fflush(stdout);
} }
} }

View File

@ -1,4 +1,4 @@
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# talk-llama # talk-llama
set(TARGET talk-llama) set(TARGET talk-llama)
#add_executable(${TARGET} talk-llama.cpp llama.cpp) #add_executable(${TARGET} talk-llama.cpp llama.cpp)

View File

@ -21,12 +21,17 @@
#if defined(_POSIX_MAPPED_FILES) #if defined(_POSIX_MAPPED_FILES)
#include <sys/mman.h> #include <sys/mman.h>
#endif #endif
#if defined(_POSIX_MEMLOCK_RANGE)
#include <sys/resource.h>
#endif
#endif #endif
#endif #endif
#if defined(_WIN32) #if defined(_WIN32)
#define WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX #define NOMINMAX
#endif
#include <windows.h> #include <windows.h>
#include <io.h> #include <io.h>
#include <stdio.h> // for _fseeki64 #include <stdio.h> // for _fseeki64
@ -41,8 +46,12 @@
} while (0) } while (0)
#ifdef __GNUC__ #ifdef __GNUC__
#ifdef __MINGW32__
__attribute__((format(gnu_printf, 1, 2)))
#else
__attribute__((format(printf, 1, 2))) __attribute__((format(printf, 1, 2)))
#endif #endif
#endif
static std::string format(const char * fmt, ...) { static std::string format(const char * fmt, ...) {
va_list ap, ap2; va_list ap, ap2;
va_start(ap, fmt); va_start(ap, fmt);
@ -55,7 +64,7 @@ static std::string format(const char * fmt, ...) {
va_end(ap2); va_end(ap2);
va_end(ap); va_end(ap);
return std::string(buf.data(), size); return std::string(buf.data(), size);
}; }
struct llama_file { struct llama_file {
// use FILE * so we don't have to re-open the file to mmap // use FILE * so we don't have to re-open the file to mmap
@ -162,7 +171,7 @@ struct llama_mmap {
#ifdef _POSIX_MAPPED_FILES #ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true; static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file) { llama_mmap(struct llama_file * file, bool prefetch = true) {
size = file->size; size = file->size;
int fd = fileno(file->fp); int fd = fileno(file->fp);
int flags = MAP_SHARED; int flags = MAP_SHARED;
@ -170,17 +179,18 @@ struct llama_mmap {
flags |= MAP_POPULATE; flags |= MAP_POPULATE;
#endif #endif
addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0); addr = mmap(NULL, file->size, PROT_READ, flags, fd, 0);
close(fd);
if (addr == MAP_FAILED) { if (addr == MAP_FAILED) {
throw format("mmap failed: %s", strerror(errno)); throw format("mmap failed: %s", strerror(errno));
} }
if (prefetch) {
// Advise the kernel to preload the mapped memory // Advise the kernel to preload the mapped memory
if (madvise(addr, file->size, MADV_WILLNEED)) { if (madvise(addr, file->size, MADV_WILLNEED)) {
fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
strerror(errno)); strerror(errno));
} }
} }
}
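A short sketch of the new `prefetch` flag added to `llama_mmap` (this assumes the `llama_file(path, mode)` constructor defined earlier in llama_util.h; the file name is illustrative):

```cpp
// Sketch: effect of the prefetch flag on llama_mmap.
static void mmap_example() {
    llama_file file("ggml-model.bin", "rb");

    llama_mmap eager(&file);                       // default: madvise/PrefetchVirtualMemory preloads the pages
    llama_mmap lazy (&file, /*prefetch =*/ false); // map only - pages fault in on demand
}
```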
~llama_mmap() { ~llama_mmap() {
munmap(addr, size); munmap(addr, size);
@ -188,14 +198,13 @@ struct llama_mmap {
#elif defined(_WIN32) #elif defined(_WIN32)
static constexpr bool SUPPORTED = true; static constexpr bool SUPPORTED = true;
llama_mmap(struct llama_file * file) { llama_mmap(struct llama_file * file, bool prefetch = true) {
size = file->size; size = file->size;
HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp)); HANDLE hFile = (HANDLE) _get_osfhandle(_fileno(file->fp));
HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL); HANDLE hMapping = CreateFileMappingA(hFile, NULL, PAGE_READONLY, 0, 0, NULL);
DWORD error = GetLastError(); DWORD error = GetLastError();
CloseHandle(hFile);
if (hMapping == NULL) { if (hMapping == NULL) {
throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str()); throw format("CreateFileMappingA failed: %s", llama_format_win_err(error).c_str());
@ -209,6 +218,8 @@ struct llama_mmap {
throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()); throw format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str());
} }
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
if (prefetch) {
// Advise the kernel to preload the mapped memory // Advise the kernel to preload the mapped memory
WIN32_MEMORY_RANGE_ENTRY range; WIN32_MEMORY_RANGE_ENTRY range;
range.VirtualAddress = addr; range.VirtualAddress = addr;
@ -218,6 +229,10 @@ struct llama_mmap {
llama_format_win_err(GetLastError()).c_str()); llama_format_win_err(GetLastError()).c_str());
} }
} }
#else
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
}
~llama_mmap() { ~llama_mmap() {
if (!UnmapViewOfFile(addr)) { if (!UnmapViewOfFile(addr)) {
@ -291,8 +306,18 @@ struct llama_mlock {
if (!mlock(addr, size)) { if (!mlock(addr, size)) {
return true; return true;
} else { } else {
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n" MLOCK_SUGGESTION, char* errmsg = std::strerror(errno);
size, this->size, std::strerror(errno)); bool suggest = (errno == ENOMEM);
// Check if the resource limit is fine after all
struct rlimit lock_limit;
if (suggest && getrlimit(RLIMIT_MEMLOCK, &lock_limit))
suggest = false;
if (suggest && (lock_limit.rlim_max > lock_limit.rlim_cur + size))
suggest = false;
fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
return false; return false;
} }
} }
@ -338,8 +363,8 @@ struct llama_mlock {
// Hopefully a megabyte is enough overhead: // Hopefully a megabyte is enough overhead:
size_t increment = size + 1048576; size_t increment = size + 1048576;
// The minimum must be <= the maximum, so we need to increase both: // The minimum must be <= the maximum, so we need to increase both:
min_ws_size += size; min_ws_size += increment;
max_ws_size += size; max_ws_size += increment;
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) { if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n", fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
llama_format_win_err(GetLastError()).c_str()); llama_format_win_err(GetLastError()).c_str());
@ -380,4 +405,29 @@ struct llama_buffer {
delete[] addr; delete[] addr;
} }
}; };
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
struct llama_ctx_buffer {
uint8_t * addr = NULL;
size_t size = 0;
void resize(size_t size) {
if (addr) {
ggml_cuda_host_free(addr);
}
addr = (uint8_t *) ggml_cuda_host_malloc(size);
this->size = size;
}
~llama_ctx_buffer() {
if (addr) {
ggml_cuda_host_free(addr);
}
}
};
#else
typedef llama_buffer llama_ctx_buffer;
#endif
#endif #endif

File diff suppressed because it is too large

View File

@ -39,12 +39,16 @@ extern "C" {
typedef struct llama_token_data { typedef struct llama_token_data {
llama_token id; // token id llama_token id; // token id
float logit; // log-odds of the token
float p; // probability of the token float p; // probability of the token
float plog; // log probability of the token
} llama_token_data; } llama_token_data;
typedef struct llama_token_data_array {
llama_token_data * data;
size_t size;
bool sorted;
} llama_token_data_array;
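A `llama_token_data_array` is typically filled from the logits of the last evaluated token; the talk-llama changes further down in this diff follow the same pattern. A minimal sketch (the helper name `make_candidates` is not part of the API):

```cpp
#include <vector>
#include "llama.h"

static llama_token_data_array make_candidates(struct llama_context * ctx,
                                              std::vector<llama_token_data> & storage) {
    const int     n_vocab = llama_n_vocab(ctx);
    const float * logits  = llama_get_logits(ctx);

    // storage is passed in by reference because the returned array points into it
    storage.clear();
    storage.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        // logit is filled in here; p is computed later by the sampling functions
        storage.push_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }

    return llama_token_data_array{ storage.data(), storage.size(), /*sorted =*/ false };
}
```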
typedef void (*llama_progress_callback)(float progress, void *ctx); typedef void (*llama_progress_callback)(float progress, void *ctx);
struct llama_context_params { struct llama_context_params {
@ -65,6 +69,20 @@ extern "C" {
void * progress_callback_user_data; void * progress_callback_user_data;
}; };
// model file types
enum llama_ftype {
LLAMA_FTYPE_ALL_F32 = 0,
LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
// LLAMA_FTYPE_MOSTLY_Q4_3 (6) support has been removed
LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
LLAMA_API struct llama_context_params llama_context_default_params(); LLAMA_API struct llama_context_params llama_context_default_params();
LLAMA_API bool llama_mmap_supported(); LLAMA_API bool llama_mmap_supported();
@ -82,27 +100,46 @@ extern "C" {
// TODO: not great API - very likely to change // TODO: not great API - very likely to change
// Returns 0 on success // Returns 0 on success
// nthread - how many threads to use. If <=0, will use std::thread::hardware_concurrency(), else the number given
LLAMA_API int llama_model_quantize( LLAMA_API int llama_model_quantize(
const char * fname_inp, const char * fname_inp,
const char * fname_out, const char * fname_out,
int itype); enum llama_ftype ftype,
int nthread);
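A sketch of calling the updated `llama_model_quantize()` with the new `ftype`/`nthread` parameters (file names and the wrapper name are illustrative):

```cpp
#include "llama.h"

static int quantize_to_q5_1(const char * fname_inp, const char * fname_out) {
    // nthread <= 0 lets the implementation pick std::thread::hardware_concurrency()
    return llama_model_quantize(fname_inp, fname_out, LLAMA_FTYPE_MOSTLY_Q5_1, /*nthread =*/ 0);
}
```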
// Returns the KV cache that will contain the context for the // Apply a LoRA adapter to a loaded model
// ongoing prediction with the model. // path_base_model is the path to a higher quality model to use as a base for
LLAMA_API const uint8_t * llama_get_kv_cache(struct llama_context * ctx); // the layers modified by the adapter. Can be NULL to use the current loaded model.
// The model needs to be reloaded before applying a new adapter, otherwise the adapter
// Returns the size of the KV cache // will be applied on top of the previous one
LLAMA_API size_t llama_get_kv_cache_size(struct llama_context * ctx); // Returns 0 on success
LLAMA_API int llama_apply_lora_from_file(
struct llama_context * ctx,
const char * path_lora,
const char * path_base_model,
int n_threads);
// Returns the number of tokens in the KV cache // Returns the number of tokens in the KV cache
LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx); LLAMA_API int llama_get_kv_cache_token_count(struct llama_context * ctx);
// Sets the KV cache containing the current context for the model // Sets the current rng seed.
LLAMA_API void llama_set_kv_cache( LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, int seed);
struct llama_context * ctx,
const uint8_t * kv_cache, // Returns the size in bytes of the state (rng, logits, embedding and kv_cache)
size_t n_size, LLAMA_API size_t llama_get_state_size(struct llama_context * ctx);
int n_token_count);
// Copies the state to the specified destination address.
// Destination needs to have allocated enough memory.
// Returns the number of bytes copied
LLAMA_API size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest);
// Set the state reading from the specified address
// Returns the number of bytes read
LLAMA_API size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src);
// Save/load session file
LLAMA_API size_t llama_load_session_file(struct llama_context * ctx, const char * path_session, llama_token * tokens_out, size_t n_token_capacity, size_t * n_token_count_out);
LLAMA_API size_t llama_save_session_file(struct llama_context * ctx, const char * path_session, const llama_token * tokens, size_t n_token_count);
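A sketch of saving and restoring the full context state (rng, logits, embedding, kv_cache) with the state API declared above; error handling is omitted and the helper names are illustrative:

```cpp
#include <cstdint>
#include <vector>
#include "llama.h"

static std::vector<uint8_t> snapshot_state(struct llama_context * ctx) {
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    const size_t n = llama_copy_state_data(ctx, state.data());
    state.resize(n); // number of bytes actually written
    return state;
}

static void restore_state(struct llama_context * ctx, const std::vector<uint8_t> & state) {
    llama_set_state_data(ctx, state.data());
}
```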
// Run the llama inference to obtain the logits and probabilities for the next token. // Run the llama inference to obtain the logits and probabilities for the next token.
// tokens + n_tokens is the provided batch of new tokens to process // tokens + n_tokens is the provided batch of new tokens to process
@ -148,16 +185,52 @@ extern "C" {
// Special tokens // Special tokens
LLAMA_API llama_token llama_token_bos(); LLAMA_API llama_token llama_token_bos();
LLAMA_API llama_token llama_token_eos(); LLAMA_API llama_token llama_token_eos();
LLAMA_API llama_token llama_token_nl();
// TODO: improve the last_n_tokens interface ? // Sampling functions
LLAMA_API llama_token llama_sample_top_p_top_k(
struct llama_context * ctx, /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix.
const llama_token * last_n_tokens_data, LLAMA_API void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float penalty);
int last_n_tokens_size,
int top_k, /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details.
float top_p, LLAMA_API void llama_sample_frequency_and_presence_penalties(struct llama_context * ctx, llama_token_data_array * candidates, llama_token * last_tokens, size_t last_tokens_size, float alpha_frequency, float alpha_presence);
float temp,
float repeat_penalty); /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
LLAMA_API void llama_sample_softmax(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_k(struct llama_context * ctx, llama_token_data_array * candidates, int k, size_t min_keep = 1);
/// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751
LLAMA_API void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
/// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.
LLAMA_API void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array * candidates, float z, size_t min_keep = 1);
/// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.
LLAMA_API void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * candidates, float p, size_t min_keep = 1);
LLAMA_API void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates, float temp);
/// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int m, float * mu);
/// @details Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words.
/// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.
/// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text.
/// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates.
/// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal.
LLAMA_API llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu);
/// @details Selects the token with the highest probability.
LLAMA_API llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_data_array * candidates);
/// @details Randomly selects a token from the candidates based on their probabilities.
LLAMA_API llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_array * candidates);
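The functions above compose into a sampling pipeline; the talk-llama example later in this diff applies a repetition penalty followed by top-k, top-p and temperature. Two sketches of picking a token from a prepared `llama_token_data_array` (the parameter values are illustrative defaults, not mandated by the API):

```cpp
#include "llama.h"

static llama_token sample_top_k_top_p(struct llama_context * ctx, llama_token_data_array * cands) {
    llama_sample_top_k(ctx, cands, 40);
    llama_sample_top_p(ctx, cands, 0.95f);
    llama_sample_temperature(ctx, cands, 0.80f);
    return llama_sample_token(ctx, cands);
}

static llama_token sample_mirostat_v2(struct llama_context * ctx, llama_token_data_array * cands) {
    const float  tau = 5.0f;       // target surprise
    const float  eta = 0.10f;      // learning rate
    static float mu  = 2.0f * tau; // per the docs above, mu starts at twice the target cross-entropy
                                   // and is updated across successive sampling calls
    return llama_sample_token_mirostat_v2(ctx, cands, tau, eta, &mu);
}
```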
// Performance information // Performance information
LLAMA_API void llama_print_timings(struct llama_context * ctx); LLAMA_API void llama_print_timings(struct llama_context * ctx);
@ -170,4 +243,15 @@ extern "C" {
} }
#endif #endif
// Internal API to be implemented by llama.cpp and used by tests/benchmarks only
#ifdef LLAMA_API_INTERNAL
#include <vector>
#include <string>
struct ggml_tensor;
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif
#endif // LLAMA_H #endif // LLAMA_H

View File

@ -1,12 +0,0 @@
// Internal header to be included by llama.cpp and tests/benchmarks only.
#ifndef LLAMA_INTERNAL_H
#define LLAMA_INTERNAL_H
#include <vector>
#include <string>
struct ggml_tensor;
std::vector<std::pair<std::string, struct ggml_tensor *>>& llama_internal_get_tensor_map(struct llama_context * ctx);
#endif // LLAMA_INTERNAL_H

View File

@ -487,11 +487,37 @@ int main(int argc, char ** argv) {
{ {
auto logits = llama_get_logits(ctx_llama); auto logits = llama_get_logits(ctx_llama);
auto n_vocab = llama_n_vocab(ctx_llama);
logits[llama_token_eos()] = 0; logits[llama_token_eos()] = 0;
id = llama_sample_top_p_top_k(ctx_llama, std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
// apply repeat penalty
const float nl_logit = logits[llama_token_nl()];
llama_sample_repetition_penalty(ctx_llama, &candidates_p,
embd_inp.data() + std::max(0, n_past - repeat_last_n), embd_inp.data() + std::max(0, n_past - repeat_last_n),
repeat_last_n, top_k, top_p, temp, repeat_penalty); repeat_last_n, repeat_penalty);
logits[llama_token_nl()] = nl_logit;
if (temp <= 0) {
// Greedy sampling
id = llama_sample_token_greedy(ctx_llama, &candidates_p);
} else {
// Temperature sampling
llama_sample_top_k(ctx_llama, &candidates_p, top_k);
llama_sample_top_p(ctx_llama, &candidates_p, top_p);
llama_sample_temperature(ctx_llama, &candidates_p, temp);
id = llama_sample_token(ctx_llama, &candidates_p);
}
} }
if (id != llama_token_eos()) { if (id != llama_token_eos()) {

View File

@ -13,6 +13,7 @@ include(DefaultTargetOptions)
target_link_libraries(${TARGET} PRIVATE target_link_libraries(${TARGET} PRIVATE
whisper whisper
common
) )
unset(EXTRA_FLAGS) unset(EXTRA_FLAGS)

View File

@ -1,4 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "common-ggml.h"
#include "gpt-2.h" #include "gpt-2.h"
#include <cmath> #include <cmath>
@ -14,150 +16,6 @@
/////////////////////// GPT-2 BEGIN ///////////////////////// /////////////////////// GPT-2 BEGIN /////////////////////////
//
// Vocab utils
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.size() == 0) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double temp,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
for (int i = 0; i < n_logits; i++) {
logits_id.push_back(std::make_pair(logits[i], i));
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
// normalize
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
if (top_p < 1.0f) {
{
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += logits_id[i].first;
if (cumsum >= top_p) {
logits_id.resize(i+1);
break;
}
}
}
// normalize again
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
}
//printf("\n");
//for (int i = 0; i < (int)logits_id.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
//}
//exit(0);
// sample from the obtained distribution
std::vector<double> probs;
probs.reserve(logits_id.size());
for (int i = 0; i < (int) logits_id.size(); i++) {
probs.push_back(logits_id[i].first);
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
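With the sampler moved into the shared common code, example programs call it through common.h. A sketch of such a call on the logits returned by `gpt2_eval()` (the wrapper name `pick_next_token` is illustrative):

```cpp
#include <random>
#include <vector>
#include "common.h"

static gpt_vocab::id pick_next_token(const gpt_vocab & vocab,
                                     const std::vector<float> & logits,
                                     std::mt19937 & rng) {
    const int    top_k = 40;
    const double top_p = 0.9;
    const double temp  = 1.0; // note: the header above warns that temperature is not implemented yet

    return gpt_sample_top_k_top_p(vocab, logits.data(), top_k, top_p, temp, rng);
}
```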
// default hparams (GPT-2 117M) // default hparams (GPT-2 117M)
struct gpt2_hparams { struct gpt2_hparams {
int32_t n_vocab = 50257; int32_t n_vocab = 50257;
@ -165,7 +23,7 @@ struct gpt2_hparams {
int32_t n_embd = 768; int32_t n_embd = 768;
int32_t n_head = 12; int32_t n_head = 12;
int32_t n_layer = 12; int32_t n_layer = 12;
int32_t f16 = 1; int32_t ftype = 1;
}; };
struct gpt2_layer { struct gpt2_layer {
@ -187,7 +45,7 @@ struct gpt2_layer {
struct ggml_tensor * c_mlp_fc_w; struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b; struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b; struct ggml_tensor * c_mlp_proj_b;
}; };
@ -200,6 +58,7 @@ struct gpt2_model {
struct ggml_tensor * wte; // position embedding struct ggml_tensor * wte; // position embedding
struct ggml_tensor * wpe; // token embedding struct ggml_tensor * wpe; // token embedding
struct ggml_tensor * lm_head; // language model head
std::vector<gpt2_layer> layers; std::vector<gpt2_layer> layers;
@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.f16, sizeof(hparams.f16)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16); printf("%s: ftype = %d\n", __func__, hparams.ftype);
} }
// load vocab // load vocab
@ -275,9 +134,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
} }
} }
// for the big tensors, we have the option to store the data in 16-bit floats // for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation // in order to save memory and also to speed up the computation
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
auto & ctx = model.ctx; auto & ctx = model.ctx;
@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
const int n_ctx = hparams.n_ctx; const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += (6 + 12*n_layer)*256; // object overhead ctx_size += (6 + 12*n_layer)*256; // object overhead
@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context // create the ggml context
{ {
struct ggml_init_params params; struct ggml_init_params params = {
params.mem_size = ctx_size; .mem_size = ctx_size,
params.mem_buffer = NULL; .mem_buffer = NULL,
.no_alloc = false,
};
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
if (!model.ctx) { if (!model.ctx) {
@ -352,6 +219,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
// map by name // map by name
model.tensors["model/ln_f/g"] = model.ln_f_g; model.tensors["model/ln_f/g"] = model.ln_f_g;
@ -359,6 +227,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/wte"] = model.wte; model.tensors["model/wte"] = model.wte;
model.tensors["model/wpe"] = model.wpe; model.tensors["model/wpe"] = model.wpe;
model.tensors["model/lm_head"] = model.lm_head;
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -369,16 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd); layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name // map by name
@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans; model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
} }
} }
@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
{ {
size_t total_size = 0; size_t total_size = 0;
bool has_lm_head = false;
while (true) { while (true) {
int32_t n_dims; int32_t n_dims;
int32_t length; int32_t length;
int32_t ftype; int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length)); fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) { if (fin.eof()) {
break; break;
@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return false; return false;
} }
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t); // for debugging
if (0) {
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
if (nelements*bpe != ggml_nbytes(tensor)) { const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe); __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false; return false;
@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor)); fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); // GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
}
if (name == "model/lm_head") {
has_lm_head = true;
}
total_size += ggml_nbytes(tensor); total_size += ggml_nbytes(tensor);
} }
@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - n_threads: number of threads to use // - n_threads: number of threads to use
// - n_past: the context size so far // - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context // - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted probabilities of the next token // - embd_w: the predicted logits for the next token
// //
bool gpt2_eval( bool gpt2_eval(
const gpt2_model & model, const gpt2_model & model,
@ -512,12 +396,12 @@ bool gpt2_eval(
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
static size_t buf_size = 640u*1024*1024; static size_t buf_size = 512u*1024*1024;
static void * buf = malloc(buf_size); static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N > buf_size) { if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate // reallocate
buf_size = buf_size_new; buf_size = buf_size_new;
@ -528,12 +412,13 @@ bool gpt2_eval(
} }
} }
struct ggml_init_params params; struct ggml_init_params params = {
params.mem_size = buf_size; /*.mem_size =*/ buf_size,
params.mem_buffer = buf; /*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {}; struct ggml_cgraph gf = {};
gf.n_threads = n_threads; gf.n_threads = n_threads;
@ -578,7 +463,7 @@ bool gpt2_eval(
// [2304, N] // [2304, N]
{ {
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w), model.layers[il].c_attn_attn_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -654,11 +539,13 @@ bool gpt2_eval(
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12] // [n_past + N, 64, 12]
struct ggml_tensor * V_trans = struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0, ggml_permute(ctx0,
ggml_reshape_3d(ctx0, ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N), n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3); 1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max // KQV = transpose(V) * KQ_soft_max
// [64, N, 12] // [64, N, 12]
@ -685,7 +572,7 @@ bool gpt2_eval(
// [768, N] // [768, N]
{ {
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w), model.layers[il].c_attn_proj_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -722,7 +609,7 @@ bool gpt2_eval(
// cur = fc_w*cur + fc_b // cur = fc_w*cur + fc_b
// [3072, N] // [3072, N]
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w), model.layers[il].c_mlp_fc_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -742,7 +629,7 @@ bool gpt2_eval(
// cur = proj_w*cur + proj_b // cur = proj_w*cur + proj_b
// [768, N] // [768, N]
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w_trans, model.layers[il].c_mlp_proj_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -769,12 +656,12 @@ bool gpt2_eval(
} }
// inpL = WTE * inpL // inpL = WTE * inpL
// [ 768, 50257] - model.wte // [ 768, 50257] - model.lm_head
// [ 768, N] - inpL // [ 768, N] - inpL
inpL = ggml_mul_mat(ctx0, model.wte, inpL); inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
// logits -> probs // logits -> probs
inpL = ggml_soft_max(ctx0, inpL); //inpL = ggml_soft_max(ctx0, inpL);
// run the computation // run the computation
ggml_build_forward_expand(&gf, inpL); ggml_build_forward_expand(&gf, inpL);
@ -788,7 +675,7 @@ bool gpt2_eval(
//embd_w.resize(n_vocab*N); //embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result for just the last token // return result just for the last token
embd_w.resize(n_vocab); embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
@ -825,7 +712,7 @@ Me too.
int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency()); int32_t n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
// sampling parameters // sampling parameters
int32_t top_k = 40; int32_t top_k = 5;
float top_p = 0.9f; float top_p = 0.9f;
float temp = 1.0f; float temp = 1.0f;
}; };
@ -833,14 +720,14 @@ Me too.
struct gpt2_context * gpt2_init(const char * path_model) { struct gpt2_context * gpt2_init(const char * path_model) {
gpt2_context * ctx = new gpt2_context; gpt2_context * ctx = new gpt2_context;
ctx->rng = std::mt19937(time(NULL)); ctx->rng = std::mt19937(time(nullptr));
// load the model // load the model
{ {
const int64_t t_start_us = ggml_time_us(); const int64_t t_start_us = ggml_time_us();
if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) { if (!gpt2_model_load(path_model, ctx->model, ctx->vocab)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, "gpt-2.bin"); fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
delete ctx; delete ctx;
return nullptr; return nullptr;
} }
@ -885,9 +772,9 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
std::string result; std::string result;
for (int i = embd.size(); i < embd_inp.size() + n_predict; i++) { for (int i = embd.size(); i < (int) embd_inp.size() + n_predict; i++) {
// predict // predict
if (embd.size() > 0) { if (!embd.empty()) {
if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) { if (!gpt2_eval(ctx->model, ctx->n_threads, n_past, embd, embd_w, mem_per_token)) {
printf("gpt-2: failed to generate text\n"); printf("gpt-2: failed to generate text\n");
return ""; return "";
@ -914,10 +801,7 @@ std::string gpt2_gen_text(gpt2_context * ctx, const char * text, int max_tokens)
result += ctx->vocab.id_to_token[embd[0]]; result += ctx->vocab.id_to_token[embd[0]];
// end of text token // end of text token
if (embd.back() == 50256 || if (embd.back() == 50256) {
ctx->vocab.id_to_token[embd.back()] == "." ||
ctx->vocab.id_to_token[embd.back()] == "!" ||
ctx->vocab.id_to_token[embd.back()] == "?") {
break; break;
} }
} }
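A sketch of driving the GPT-2 helper used by the talk example; `gpt2_init()` and `gpt2_gen_text()` appear in this diff, while `gpt2_free()` is assumed to exist in gpt-2.h for cleanup:

```cpp
#include <cstdio>
#include <string>
#include "gpt-2.h"

static int demo_gpt2(const char * model_path) {
    struct gpt2_context * g = gpt2_init(model_path);
    if (g == nullptr) {
        return 1;
    }

    const std::string reply = gpt2_gen_text(g, "Hello, how are you?", 64); // up to 64 new tokens
    printf("%s\n", reply.c_str());

    gpt2_free(g); // assumed cleanup call
    return 0;
}
```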

View File

@ -2,18 +2,12 @@
// TODO: Change to C-style API and move to ./examples for easy reuse. // TODO: Change to C-style API and move to ./examples for easy reuse.
#include "common.h"
#include <vector> #include <vector>
#include <map> #include <map>
#include <string> #include <string>
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};
struct gpt2_context; struct gpt2_context;
struct gpt2_context * gpt2_init(const char * path_model); struct gpt2_context * gpt2_init(const char * path_model);

View File

@ -44,6 +44,15 @@
<br><br> <br><br>
<b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<br><br>
<hr> <hr>
Select the models you would like to use and click the "Start" button to begin the conversation Select the models you would like to use and click the "Start" button to begin the conversation
@ -54,6 +63,10 @@
Whisper model: <span id="model-whisper-status"></span> Whisper model: <span id="model-whisper-status"></span>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<span id="fetch-whisper-progress"></span> <span id="fetch-whisper-progress"></span>
<!-- <!--
@ -266,11 +279,17 @@
let urls = { let urls = {
'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin', 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin', 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
}; };
let sizes = { let sizes = {
'tiny.en': 75, 'tiny.en': 75,
'base.en': 142, 'base.en': 142,
'tiny-en-q5_1': 31,
'base-en-q5_1': 57,
}; };
let url = urls[model]; let url = urls[model];
@ -281,6 +300,10 @@
document.getElementById('fetch-whisper-tiny-en').style.display = 'none'; document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
document.getElementById('fetch-whisper-base-en').style.display = 'none'; document.getElementById('fetch-whisper-base-en').style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1').style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1').style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... '; document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
cbProgress = function(p) { cbProgress = function(p) {
@ -292,6 +315,10 @@
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1'); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };


@ -1,16 +1,8 @@
if (WHISPER_SUPPORT_SDL2) if (WHISPER_SDL2)
# talk # talk
set(TARGET talk) set(TARGET talk)
#add_executable(${TARGET} talk.cpp gpt-2.cpp) add_executable(${TARGET} talk.cpp gpt-2.cpp)
#target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS}) target_link_libraries(${TARGET} PRIVATE common common-sdl whisper ${CMAKE_THREAD_LIBS_INIT})
#target_link_libraries(${TARGET} PRIVATE whisper ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
# TODO: this is temporary
# need to export ggml symbols for MSVC, but too lazy ..
add_executable(${TARGET} talk.cpp gpt-2.cpp ../common.cpp ../common-sdl.cpp ../../ggml.c ../../whisper.cpp)
include(DefaultTargetOptions) include(DefaultTargetOptions)
target_include_directories(${TARGET} PRIVATE ${SDL2_INCLUDE_DIRS} ../../)
target_link_libraries(${TARGET} PRIVATE ${SDL2_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT})
endif () endif ()


@ -1,4 +1,6 @@
#include "ggml.h" #include "ggml.h"
#include "common-ggml.h"
#include "gpt-2.h" #include "gpt-2.h"
#include <cmath> #include <cmath>
@ -14,150 +16,6 @@
/////////////////////// GPT-2 BEGIN ///////////////////////// /////////////////////// GPT-2 BEGIN /////////////////////////
//
// Vocab utils
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.empty()) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const gpt_vocab & vocab,
const float * logits,
int top_k,
double top_p,
double /*temp*/,
std::mt19937 & rng) {
int n_logits = vocab.id_to_token.size();
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
for (int i = 0; i < n_logits; i++) {
logits_id.emplace_back(logits[i], i);
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
// normalize
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
if (top_p < 1.0f) {
{
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += logits_id[i].first;
if (cumsum >= top_p) {
logits_id.resize(i+1);
break;
}
}
}
// normalize again
{
double sum = 0.0f;
for (int i = 0; i < (int)logits_id.size(); i++) {
sum += logits_id[i].first;
}
sum = 1.0/sum;
for (int i = 0; i < (int)logits_id.size(); i++) {
logits_id[i].first *= sum;
}
}
}
//printf("\n");
//for (int i = 0; i < (int) logits_id.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), logits_id[i].first);
//}
//exit(0);
// sample from the obtained distribution
std::vector<double> probs;
probs.reserve(logits_id.size());
for (int i = 0; i < (int) logits_id.size(); i++) {
probs.push_back(logits_id[i].first);
}
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}
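The tokenizer and the top-k/top-p sampler above are deleted from this file because the shared common / common-ggml helpers pulled in by the new includes appear to provide them now. For orientation, a minimal sketch of how such a sampler is typically driven from the logits that gpt2_eval writes into embd_w; the wrapper and its default parameters are illustrative, not taken from this diff:

// illustrative sketch: sample the next token from the n_vocab logits produced by gpt2_eval
// (vocab, embd_w and rng correspond to ctx->vocab, the logits vector and ctx->rng in this file)
gpt_vocab::id sample_next(const gpt_vocab & vocab, const std::vector<float> & embd_w, std::mt19937 & rng) {
    const int    top_k = 40;   // assumed typical sampling parameters
    const double top_p = 0.9;
    const double temp  = 1.0;  // note: this particular sampler ignores temperature
    return gpt_sample_top_k_top_p(vocab, embd_w.data(), top_k, top_p, temp, rng);
}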
// default hparams (GPT-2 117M) // default hparams (GPT-2 117M)
struct gpt2_hparams { struct gpt2_hparams {
int32_t n_vocab = 50257; int32_t n_vocab = 50257;
@ -165,7 +23,7 @@ struct gpt2_hparams {
int32_t n_embd = 768; int32_t n_embd = 768;
int32_t n_head = 12; int32_t n_head = 12;
int32_t n_layer = 12; int32_t n_layer = 12;
int32_t f16 = 1; int32_t ftype = 1;
}; };
struct gpt2_layer { struct gpt2_layer {
@ -187,7 +45,7 @@ struct gpt2_layer {
struct ggml_tensor * c_mlp_fc_w; struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b; struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w_trans; // transposed for efficiency struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b; struct ggml_tensor * c_mlp_proj_b;
}; };
@ -200,6 +58,7 @@ struct gpt2_model {
struct ggml_tensor * wte; // token embedding
struct ggml_tensor * wpe; // position embedding
struct ggml_tensor * lm_head; // language model head
std::vector<gpt2_layer> layers; std::vector<gpt2_layer> layers;
@ -241,14 +100,14 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.f16, sizeof(hparams.f16)); fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd); printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head); printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer); printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: f16 = %d\n", __func__, hparams.f16); printf("%s: ftype = %d\n", __func__, hparams.ftype);
} }
// load vocab // load vocab
@ -268,16 +127,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read((char *) &len, sizeof(len)); fin.read((char *) &len, sizeof(len));
word.resize(len); word.resize(len);
fin.read((char *) &word[0], len); fin.read((char *) word.data(), len);
vocab.token_to_id[word] = i; vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word; vocab.id_to_token[i] = word;
} }
} }
// for the big tensors, we have the option to store the data in 16-bit floats // for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation // in order to save memory and also to speed up the computation
const ggml_type wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
__func__, fname.c_str(), model.hparams.ftype);
return false;
}
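The model header now carries a general ftype instead of a plain f16 flag, so the loader translates the file type into a per-tensor weight type and rejects values it does not recognize. The real conversion is provided by the updated ggml/common helpers; as an illustrative sketch only, such a mapping over the ggml_ftype / ggml_type enums (shown in the ggml.h diff further below) looks roughly like this:

// illustrative sketch of an ftype -> ggml_type mapping; the shipped helper is
// ggml_ftype_to_ggml_type, and GGML_TYPE_COUNT is used to flag "invalid"
static ggml_type wtype_from_ftype(enum ggml_ftype ftype) {
    switch (ftype) {
        case GGML_FTYPE_ALL_F32:     return GGML_TYPE_F32;
        case GGML_FTYPE_MOSTLY_F16:  return GGML_TYPE_F16;
        case GGML_FTYPE_MOSTLY_Q4_0: return GGML_TYPE_Q4_0;
        case GGML_FTYPE_MOSTLY_Q4_1: return GGML_TYPE_Q4_1;
        case GGML_FTYPE_MOSTLY_Q4_2: return GGML_TYPE_Q4_2;
        case GGML_FTYPE_MOSTLY_Q5_0: return GGML_TYPE_Q5_0;
        case GGML_FTYPE_MOSTLY_Q5_1: return GGML_TYPE_Q5_1;
        case GGML_FTYPE_MOSTLY_Q8_0: return GGML_TYPE_Q8_0;
        default:                     return GGML_TYPE_COUNT; // unknown / unsupported
    }
}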
auto & ctx = model.ctx; auto & ctx = model.ctx;
@ -291,32 +155,33 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
const int n_ctx = hparams.n_ctx; const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_g ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
ctx_size += n_embd*ggml_type_size(GGML_TYPE_F32); // ln_f_b ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
ctx_size += n_vocab*n_embd*ggml_type_size(wtype); // wte ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
ctx_size += n_ctx*n_embd*ggml_type_size(GGML_TYPE_F32); // wpe ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_1_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_g ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
ctx_size += n_layer*(n_embd*ggml_type_size(GGML_TYPE_F32)); // ln_2_b ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_size(wtype)); // c_attn_attn_w ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
ctx_size += n_layer*( 3*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_attn_b ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
ctx_size += n_layer*(n_embd*n_embd*ggml_type_size(wtype)); // c_attn_proj_w ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_attn_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_fc_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
ctx_size += n_layer*( 4*n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_fc_b ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_size(wtype)); // c_mlp_proj_w ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
ctx_size += n_layer*( n_embd*ggml_type_size(GGML_TYPE_F32)); // c_mlp_proj_b ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_k ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
ctx_size += n_ctx*n_layer*n_embd*ggml_type_size(GGML_TYPE_F32); // memory_v ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
ctx_size += (6 + 12*n_layer)*256; // object overhead ctx_size += (6 + 12*n_layer)*256; // object overhead
@ -325,9 +190,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// create the ggml context // create the ggml context
{ {
struct ggml_init_params params; struct ggml_init_params params = {
params.mem_size = ctx_size; .mem_size = ctx_size,
params.mem_buffer = nullptr; .mem_buffer = NULL,
.no_alloc = false,
};
model.ctx = ggml_init(params); model.ctx = ggml_init(params);
if (!model.ctx) { if (!model.ctx) {
@ -352,6 +219,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
// map by name // map by name
model.tensors["model/ln_f/g"] = model.ln_f_g; model.tensors["model/ln_f/g"] = model.ln_f_g;
@ -359,6 +227,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/wte"] = model.wte; model.tensors["model/wte"] = model.wte;
model.tensors["model/wpe"] = model.wpe; model.tensors["model/wpe"] = model.wpe;
model.tensors["model/lm_head"] = model.lm_head;
for (int i = 0; i < n_layer; ++i) { for (int i = 0; i < n_layer; ++i) {
auto & layer = model.layers[i]; auto & layer = model.layers[i];
@ -369,16 +238,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, 3*n_embd, n_embd); layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
layer.c_mlp_proj_w_trans = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
// map by name // map by name
@ -397,7 +266,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w_trans; model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
} }
} }
@ -425,14 +294,16 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
{ {
size_t total_size = 0; size_t total_size = 0;
bool has_lm_head = false;
while (true) { while (true) {
int32_t n_dims; int32_t n_dims;
int32_t length; int32_t length;
int32_t ftype; int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims)); fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length)); fin.read(reinterpret_cast<char *>(&length), sizeof(length));
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype)); fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) { if (fin.eof()) {
break; break;
@ -448,7 +319,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
std::string name(length, 0); std::string name(length, 0);
fin.read(&name[0], length); fin.read(&name[0], length);
if (model.tensors.find(name) == model.tensors.end()) { if (model.tensors.find(name.data()) == model.tensors.end()) {
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data()); fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false; return false;
} }
@ -461,13 +332,18 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]); __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return false; return false;
} }
const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t); // for debugging
if (0) {
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
if (nelements*bpe != ggml_nbytes(tensor)) { const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe); __func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false; return false;
@ -475,7 +351,15 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor)); fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
//printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); // GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
}
if (name == "model/lm_head") {
has_lm_head = true;
}
total_size += ggml_nbytes(tensor); total_size += ggml_nbytes(tensor);
} }
@ -493,7 +377,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
// - n_threads: number of threads to use // - n_threads: number of threads to use
// - n_past: the context size so far // - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context // - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted probabilities of the next token // - embd_w: the predicted logits for the next token
// //
bool gpt2_eval( bool gpt2_eval(
const gpt2_model & model, const gpt2_model & model,
@ -512,12 +396,12 @@ bool gpt2_eval(
const int n_head = hparams.n_head; const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab; const int n_vocab = hparams.n_vocab;
static size_t buf_size = 5640ull*1024*1024; static size_t buf_size = 512u*1024*1024;
static void * buf = malloc(buf_size); static void * buf = malloc(buf_size);
if (mem_per_token > 0 && mem_per_token*N > buf_size) { if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new); //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
// reallocate // reallocate
buf_size = buf_size_new; buf_size = buf_size_new;
@ -528,12 +412,13 @@ bool gpt2_eval(
} }
} }
struct ggml_init_params params; struct ggml_init_params params = {
params.mem_size = buf_size; /*.mem_size =*/ buf_size,
params.mem_buffer = buf; /*.mem_buffer =*/ buf,
/*.no_alloc =*/ false,
};
struct ggml_context * ctx0 = ggml_init(params); struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph gf = {}; struct ggml_cgraph gf = {};
gf.n_threads = n_threads; gf.n_threads = n_threads;
@ -578,7 +463,7 @@ bool gpt2_eval(
// [2304, N] // [2304, N]
{ {
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_attn_w), model.layers[il].c_attn_attn_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -654,11 +539,13 @@ bool gpt2_eval(
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12] // [n_past + N, 64, 12]
struct ggml_tensor * V_trans = struct ggml_tensor * V_trans =
ggml_cpy(ctx0,
ggml_permute(ctx0, ggml_permute(ctx0,
ggml_reshape_3d(ctx0, ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N), n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3); 1, 2, 0, 3),
ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max // KQV = transpose(V) * KQ_soft_max
// [64, N, 12] // [64, N, 12]
@ -685,7 +572,7 @@ bool gpt2_eval(
// [768, N] // [768, N]
{ {
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_attn_proj_w), model.layers[il].c_attn_proj_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -722,7 +609,7 @@ bool gpt2_eval(
// cur = fc_w*cur + fc_b // cur = fc_w*cur + fc_b
// [3072, N] // [3072, N]
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
ggml_transpose(ctx0, model.layers[il].c_mlp_fc_w), model.layers[il].c_mlp_fc_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -742,7 +629,7 @@ bool gpt2_eval(
// cur = proj_w*cur + proj_b // cur = proj_w*cur + proj_b
// [768, N] // [768, N]
cur = ggml_mul_mat(ctx0, cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w_trans, model.layers[il].c_mlp_proj_w,
cur); cur);
cur = ggml_add(ctx0, cur = ggml_add(ctx0,
@ -769,12 +656,12 @@ bool gpt2_eval(
} }
// inpL = WTE * inpL // inpL = WTE * inpL
// [ 768, 50257] - model.wte // [ 768, 50257] - model.lm_head
// [ 768, N] - inpL // [ 768, N] - inpL
inpL = ggml_mul_mat(ctx0, model.wte, inpL); inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
// logits -> probs // logits -> probs
inpL = ggml_soft_max(ctx0, inpL); //inpL = ggml_soft_max(ctx0, inpL);
// run the computation // run the computation
ggml_build_forward_expand(&gf, inpL); ggml_build_forward_expand(&gf, inpL);
@ -788,7 +675,7 @@ bool gpt2_eval(
//embd_w.resize(n_vocab*N); //embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result for just the last token // return result just for the last token
embd_w.resize(n_vocab); embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);


@ -2,18 +2,12 @@
// TODO: Change to C-style API and move to ./examples for easy reuse. // TODO: Change to C-style API and move to ./examples for easy reuse.
#include "common.h"
#include <vector> #include <vector>
#include <map> #include <map>
#include <string> #include <string>
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
};
struct gpt2_context; struct gpt2_context;
struct gpt2_context * gpt2_init(const char * path_model); struct gpt2_context * gpt2_init(const char * path_model);


@ -9,4 +9,6 @@ To use:
5. Select the "release" active build variant, and use Android Studio to run and deploy to your device.
[^1]: I recommend the tiny or base models for running on an Android device.
(PS: Do not move this Android project folder on its own to another location, because it depends on files from the rest of the repository.)
<img width="300" alt="image" src="https://user-images.githubusercontent.com/1670775/221613663-a17bf770-27ef-45ab-9a46-a5f99ba65d2a.jpg">


@ -31,9 +31,9 @@ endif()
set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \ set_target_properties(${TARGET} PROPERTIES LINK_FLAGS " \
--bind \ --bind \
-s USE_PTHREADS=1 \ -s USE_PTHREADS=1 \
-s PTHREAD_POOL_SIZE=8 \ -s PTHREAD_POOL_SIZE_STRICT=0 \
-s INITIAL_MEMORY=1500MB \ -s INITIAL_MEMORY=2000MB \
-s TOTAL_MEMORY=1500MB \ -s TOTAL_MEMORY=2000MB \
-s FORCE_FILESYSTEM=1 \ -s FORCE_FILESYSTEM=1 \
-s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \ -s EXPORTED_RUNTIME_METHODS=\"['print', 'printErr', 'ccall', 'cwrap']\" \
${EXTRA_FLAGS} \ ${EXTRA_FLAGS} \


@ -10,6 +10,12 @@ std::thread g_worker;
std::vector<struct whisper_context *> g_contexts(4, nullptr); std::vector<struct whisper_context *> g_contexts(4, nullptr);
static inline int mpow2(int n) {
int p = 1;
while (p <= n) p *= 2;
return p/2;
}
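mpow2 rounds its argument down to the nearest power of two; together with the new nthreads argument it caps the worker pool at 16 threads or the nearest power of two below the reported hardware concurrency. A standalone check of the helper, for reference:

// standalone check: mpow2 rounds n down to the nearest power of two (mpow2(0) == 0)
#include <cassert>

static inline int mpow2(int n) { int p = 1; while (p <= n) p *= 2; return p/2; }

int main() {
    assert(mpow2(1) == 1 && mpow2(7) == 4 && mpow2(8) == 8 && mpow2(12) == 8);
    return 0;
}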
EMSCRIPTEN_BINDINGS(whisper) { EMSCRIPTEN_BINDINGS(whisper) {
emscripten::function("init", emscripten::optional_override([](const std::string & path_model) { emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
if (g_worker.joinable()) { if (g_worker.joinable()) {
@ -43,7 +49,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
} }
})); }));
emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, bool translate) { emscripten::function("full_default", emscripten::optional_override([](size_t index, const emscripten::val & audio, const std::string & lang, int nthreads, bool translate) {
if (g_worker.joinable()) { if (g_worker.joinable()) {
g_worker.join(); g_worker.join();
} }
@ -66,7 +72,7 @@ EMSCRIPTEN_BINDINGS(whisper) {
params.print_special = false; params.print_special = false;
params.translate = translate; params.translate = translate;
params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en"; params.language = whisper_is_multilingual(g_contexts[index]) ? lang.c_str() : "en";
params.n_threads = std::min(8, (int) std::thread::hardware_concurrency()); params.n_threads = std::min(nthreads, std::min(16, mpow2(std::thread::hardware_concurrency())));
params.offset_ms = 0; params.offset_ms = 0;
std::vector<float> pcmf32; std::vector<float> pcmf32;


@ -40,21 +40,42 @@
Note that the computation is quite heavy and may take a few seconds to complete.<br> Note that the computation is quite heavy and may take a few seconds to complete.<br>
The transcription results will be displayed in the text area below.<br><br> The transcription results will be displayed in the text area below.<br><br>
<b>Important: your browser must support WASM SIMD instructions for this to work.</b> <b>Important:</b>
<ul>
<li>your browser must support WASM SIMD instructions for this to work</li>
<li>Firefox cannot load files larger than 256 MB - use Chrome instead</li>
</ul>
<br><br><hr> <b>More examples:</b>
<a href="https://whisper.ggerganov.com/">main</a> |
<a href="https://whisper.ggerganov.com/bench">bench</a> |
<a href="https://whisper.ggerganov.com/stream">stream</a> |
<a href="https://whisper.ggerganov.com/command">command</a> |
<a href="https://whisper.ggerganov.com/talk">talk</a> |
<hr>
<div id="model"> <div id="model">
Whisper model: <span id="model-whisper-status"></span> Whisper models: <span id="model-whisper-status"></span><br><br>
<button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button> <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
<button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button> <button id="fetch-whisper-tiny" onclick="loadWhisper('tiny')">tiny (75 MB)</button>
<button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button> <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
<button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button> <button id="fetch-whisper-base" onclick="loadWhisper('base')">base (142 MB)</button>
<button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button> <button id="fetch-whisper-small-en" onclick="loadWhisper('small.en')">small.en (466 MB)</button>
<button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button> <button id="fetch-whisper-small" onclick="loadWhisper('small')">small (466 MB)</button>
<span id="fetch-whisper-progress"></span>
<input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" /> <input type="file" id="whisper-file" name="file" onchange="loadFile(event, 'whisper.bin')" />
<br><br>
Quantized models:<br><br>
<button id="fetch-whisper-tiny-en-q5_1" onclick="loadWhisper('tiny-en-q5_1')">tiny.en (Q5_1, 31 MB)</button>
<button id="fetch-whisper-tiny-q5_1" onclick="loadWhisper('tiny-q5_1')">tiny (Q5_1, 31 MB)</button>
<button id="fetch-whisper-base-en-q5_1" onclick="loadWhisper('base-en-q5_1')">base.en (Q5_1, 57 MB)</button>
<button id="fetch-whisper-base-q5_1" onclick="loadWhisper('base-q5_1')">base (Q5_1, 57 MB)</button>
<button id="fetch-whisper-small-en-q5_1" onclick="loadWhisper('small-en-q5_1')">small.en (Q5_1, 182 MB)</button>
<button id="fetch-whisper-small-q5_1" onclick="loadWhisper('small-q5_1')">small (Q5_1, 182 MB)</button><br>
<button id="fetch-whisper-medium-en-q5_0" onclick="loadWhisper('medium-en-q5_0')">medium.en (Q5_0, 515 MB)</button>
<button id="fetch-whisper-medium-q5_0" onclick="loadWhisper('medium-q5_0')">medium (Q5_0, 515 MB)</button>
<button id="fetch-whisper-large-q5_0" onclick="loadWhisper('large-q5_0')">large (Q5_0, 1030 MB)</button>
<span id="fetch-whisper-progress"></span>
</div> </div>
<br> <br>
@ -161,6 +182,12 @@
<option value="yi">Yiddish</option> <option value="yi">Yiddish</option>
</select> </select>
</td> </td>
<!-- Slider to select number of threads between 1 and 16 -->
<td>
Threads:
<input type="range" id="threads" name="threads" min="1" max="16" value="8" onchange="changeThreads(this.value)" />
<span id="threads-value">8</span>
</td>
<td> <td>
<button onclick="onProcess(false);">Transcribe</button> <button onclick="onProcess(false);">Transcribe</button>
</td> </td>
@ -263,11 +290,13 @@
Module.FS_createDataFile("/", fname, buf, true, true); Module.FS_createDataFile("/", fname, buf, true, true);
model_whisper = fname; //model_whisper = fname;
document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!'; document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length); printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
document.getElementById('model').innerHTML = 'Model fetched: ' + model_whisper;
} }
function loadFile(event, fname) { function loadFile(event, fname) {
@ -292,6 +321,17 @@
document.getElementById('fetch-whisper-tiny' ).style.display = 'none'; document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none'; document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('fetch-whisper-small' ).style.display = 'none'; document.getElementById('fetch-whisper-small' ).style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none'; document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name; document.getElementById('model-whisper-status' ).innerHTML = 'loaded model: ' + file.name;
} }
@ -304,6 +344,16 @@
'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin', 'base': 'https://whisper.ggerganov.com/ggml-model-whisper-base.bin',
'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin', 'small.en': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en.bin',
'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin', 'small': 'https://whisper.ggerganov.com/ggml-model-whisper-small.bin',
'tiny-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en-q5_1.bin',
'tiny-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny-q5_1.bin',
'base-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en-q5_1.bin',
'base-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-base-q5_1.bin',
'small-en-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small.en-q5_1.bin',
'small-q5_1': 'https://whisper.ggerganov.com/ggml-model-whisper-small-q5_1.bin',
'medium-en-q5_0':'https://whisper.ggerganov.com/ggml-model-whisper-medium.en-q5_0.bin',
'medium-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-medium-q5_0.bin',
'large-q5_0': 'https://whisper.ggerganov.com/ggml-model-whisper-large-q5_0.bin',
}; };
let sizes = { let sizes = {
@ -313,6 +363,16 @@
'base': 142, 'base': 142,
'small.en': 466, 'small.en': 466,
'small': 466, 'small': 466,
'tiny-en-q5_1': 31,
'tiny-q5_1': 31,
'base-en-q5_1': 57,
'base-q5_1': 57,
'small-en-q5_1': 182,
'small-q5_1': 182,
'medium-en-q5_0': 515,
'medium-q5_0': 515,
'large-q5_0': 1030,
}; };
let url = urls[model]; let url = urls[model];
@ -327,6 +387,17 @@
document.getElementById('fetch-whisper-tiny' ).style.display = 'none'; document.getElementById('fetch-whisper-tiny' ).style.display = 'none';
document.getElementById('fetch-whisper-base' ).style.display = 'none'; document.getElementById('fetch-whisper-base' ).style.display = 'none';
document.getElementById('fetch-whisper-small' ).style.display = 'none'; document.getElementById('fetch-whisper-small' ).style.display = 'none';
document.getElementById('fetch-whisper-tiny-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-tiny-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-base-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-en-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-small-q5_1' ).style.display = 'none';
document.getElementById('fetch-whisper-medium-en-q5_0').style.display = 'none';
document.getElementById('fetch-whisper-medium-q5_0' ).style.display = 'none';
document.getElementById('fetch-whisper-large-q5_0' ).style.display = 'none';
document.getElementById('whisper-file' ).style.display = 'none'; document.getElementById('whisper-file' ).style.display = 'none';
document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model; document.getElementById('model-whisper-status').innerHTML = 'loading model: ' + model;
@ -337,12 +408,24 @@
cbCancel = function() { cbCancel = function() {
var el; var el;
el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny-en' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base-en' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-small-en'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-tiny' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-base' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('fetch-whisper-small' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-tiny-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-base-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-en-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-small-q5_1' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-medium-en-q5_0'); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-medium-q5_0' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('fetch-whisper-large-q5_0' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block'; el = document.getElementById('whisper-file' ); if (el) el.style.display = 'inline-block';
el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = ''; el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
}; };
@ -354,7 +437,8 @@
// audio file // audio file
// //
const kMaxAudio_s = 120; const kMaxAudio_s = 30*60;
const kMaxRecording_s = 2*60;
const kSampleRate = 16000; const kSampleRate = 16000;
window.AudioContext = window.AudioContext || window.webkitAudioContext; window.AudioContext = window.AudioContext || window.webkitAudioContext;
@ -423,7 +507,7 @@
doRecording = false; doRecording = false;
} }
// record up to kMaxAudio_s seconds of audio from the microphone // record up to kMaxRecording_s seconds of audio from the microphone
// check if doRecording is false every 1000 ms and stop recording if so // check if doRecording is false every 1000 ms and stop recording if so
// update progress information // update progress information
function startRecording() { function startRecording() {
@ -479,9 +563,9 @@
printTextarea('js: audio recorded, size: ' + audio.length); printTextarea('js: audio recorded, size: ' + audio.length);
// truncate to the first kMaxRecording_s seconds
if (audio.length > kMaxAudio_s*kSampleRate) { if (audio.length > kMaxRecording_s*kSampleRate) {
audio = audio.slice(0, kMaxAudio_s*kSampleRate); audio = audio.slice(0, kMaxRecording_s*kSampleRate);
printTextarea('js: truncated audio to first ' + kMaxAudio_s + ' seconds'); printTextarea('js: truncated audio to first ' + kMaxRecording_s + ' seconds');
} }
setAudio(audio); setAudio(audio);
}); });
@ -509,24 +593,31 @@
}); });
} }
document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxAudio_s) + '%'; document.getElementById('progress-bar').style.width = (100*(Date.now() - startTime)/1000/kMaxRecording_s) + '%';
document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxAudio_s).toFixed(0) + '%'; document.getElementById('progress-text').innerHTML = (100*(Date.now() - startTime)/1000/kMaxRecording_s).toFixed(0) + '%';
}, 1000); }, 1000);
printTextarea('js: recording ...'); printTextarea('js: recording ...');
setTimeout(function() { setTimeout(function() {
if (doRecording) { if (doRecording) {
printTextarea('js: recording stopped after ' + kMaxAudio_s + ' seconds'); printTextarea('js: recording stopped after ' + kMaxRecording_s + ' seconds');
stopRecording(); stopRecording();
} }
}, kMaxAudio_s*1000); }, kMaxRecording_s*1000);
} }
// //
// transcribe // transcribe
// //
var nthreads = 8;
function changeThreads(value) {
nthreads = value;
document.getElementById('threads-value').innerHTML = nthreads;
}
function onProcess(translate) { function onProcess(translate) {
if (!instance) { if (!instance) {
instance = Module.init('whisper.bin'); instance = Module.init('whisper.bin');
@ -553,7 +644,7 @@
printTextarea(''); printTextarea('');
setTimeout(function() { setTimeout(function() {
var ret = Module.full_default(instance, audio, document.getElementById('language').value, translate); var ret = Module.full_default(instance, audio, document.getElementById('language').value, nthreads, translate);
console.log('js: full_default returned: ' + ret); console.log('js: full_default returned: ' + ret);
if (ret) { if (ret) {
printTextarea("js: whisper returned: " + ret); printTextarea("js: whisper returned: " + ret);

extra/quantize-all.sh (new executable file, 45 lines)

@ -0,0 +1,45 @@
#!/bin/bash
printf "Usage: $0 <upload>"
if [ $# -ne 1 ]; then
printf "\nError: Invalid number of arguments\n"
exit 1
fi
qtype0="q5_0"
qtype1="q5_1"
upload="$1"
cd `dirname $0`
cd ../
./quantize ./models/ggml-tiny.en.bin ./models/ggml-tiny.en-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-tiny.bin ./models/ggml-tiny-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-base.en.bin ./models/ggml-base.en-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-base.bin ./models/ggml-base-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-small.en.bin ./models/ggml-small.en-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-small.bin ./models/ggml-small-${qtype1}.bin ${qtype1}
./quantize ./models/ggml-medium.en.bin ./models/ggml-medium.en-${qtype0}.bin ${qtype0}
./quantize ./models/ggml-medium.bin ./models/ggml-medium-${qtype0}.bin ${qtype0}
./quantize ./models/ggml-large.bin ./models/ggml-large-${qtype0}.bin ${qtype0}
if [ "$upload" == "1" ]; then
scp ./models/ggml-tiny.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny.en-${qtype1}.bin
scp ./models/ggml-tiny-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-tiny-${qtype1}.bin
scp ./models/ggml-base.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base.en-${qtype1}.bin
scp ./models/ggml-base-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-base-${qtype1}.bin
scp ./models/ggml-small.en-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small.en-${qtype1}.bin
scp ./models/ggml-small-${qtype1}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-small-${qtype1}.bin
scp ./models/ggml-medium.en-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium.en-${qtype0}.bin
scp ./models/ggml-medium-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-medium-${qtype0}.bin
scp ./models/ggml-large-${qtype0}.bin root@linode0:/mnt/Data/ggml/ggml-model-whisper-large-${qtype0}.bin
fi

extra/sync-ggml.sh (new executable file, 10 lines)

@ -0,0 +1,10 @@
#!/bin/bash
cp -rpv ../ggml/src/ggml.c ./ggml.c
cp -rpv ../ggml/src/ggml-cuda.cu ./ggml-cuda.cu
cp -rpv ../ggml/src/ggml-cuda.h ./ggml-cuda.h
cp -rpv ../ggml/include/ggml/ggml.h ./ggml.h
cp -rpv ../ggml/examples/common.h ./examples/common.h
cp -rpv ../ggml/examples/common.cpp ./examples/common.cpp
cp -rpv ../ggml/examples/common-ggml.h ./examples/common-ggml.h
cp -rpv ../ggml/examples/common-ggml.cpp ./examples/common-ggml.cpp

ggml-cuda.cu (new file, 365 lines)

@ -0,0 +1,365 @@
#include <stdint.h>
#include <stdio.h>
#include <cuda_fp16.h>
#include <atomic>
#include "ggml-cuda.h"
typedef uint16_t ggml_fp16_t;
static_assert(sizeof(__half) == sizeof(ggml_fp16_t), "wrong fp16 size");
#define QK4_0 32
typedef struct {
float d; // delta
uint8_t qs[QK4_0 / 2]; // nibbles / quants
} block_q4_0;
static_assert(sizeof(block_q4_0) == sizeof(float) + QK4_0 / 2, "wrong q4_0 block size/padding");
#define QK4_1 32
typedef struct {
float d; // delta
float m; // min
uint8_t qs[QK4_1 / 2]; // nibbles / quants
} block_q4_1;
static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK4_1 / 2, "wrong q4_1 block size/padding");
#define QK4_2 16
typedef struct {
__half d; // delta
uint8_t qs[QK4_2 / 2]; // nibbles / quants
} block_q4_2;
static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
#define QK5_0 32
typedef struct {
__half d; // delta
uint8_t qh[4]; // 5-th bit of quants
uint8_t qs[QK5_0 / 2]; // nibbles / quants
} block_q5_0;
static_assert(sizeof(block_q5_0) == sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_0 / 2, "wrong q5_0 block size/padding");
#define QK5_1 32
typedef struct {
__half d; // delta
__half m; // min
uint32_t qh; // 5-th bit of quants
uint8_t qs[QK5_1 / 2]; // nibbles / quants
} block_q5_1;
static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_fp16_t) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
#define QK8_0 32
typedef struct {
float d; // delta
int8_t qs[QK8_0]; // quants
} block_q8_0;
static_assert(sizeof(block_q8_0) == sizeof(float) + QK8_0, "wrong q8_0 block size/padding");
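As a quick aside (not part of this file), the block layouts above fix the storage cost per weight: each block packs 32 weights (16 for Q4_2) next to its scale and, where present, its minimum and its high-bit mask. A small worked check of the implied bits per weight:

// worked check of the bits-per-weight implied by the block definitions above
#include <cstdio>

int main() {
    // bytes per block * 8 / weights per block
    printf("q4_0: %.1f bits/weight\n", 8.0*(4         + 16) / 32); // 5.0
    printf("q4_1: %.1f bits/weight\n", 8.0*(4 + 4     + 16) / 32); // 6.0
    printf("q5_0: %.1f bits/weight\n", 8.0*(2     + 4 + 16) / 32); // 5.5
    printf("q5_1: %.1f bits/weight\n", 8.0*(2 + 2 + 4 + 16) / 32); // 6.0
    printf("q8_0: %.1f bits/weight\n", 8.0*(4         + 32) / 32); // 9.0
    return 0;
}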
static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
const block_q4_0 * x = (const block_q4_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_0; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*QK4_0 + l + 0] = v0;
y[i*QK4_0 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
const block_q4_1 * x = (const block_q4_1 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const float m = x[i].m;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_1; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*QK4_1 + l + 0] = v0;
y[i*QK4_1 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q4_2(const void * vx, float * y) {
const block_q4_2 * x = (const block_q4_2 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
for (int l = 0; l < QK4_2; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vi0 = vi & 0xf;
const int8_t vi1 = vi >> 4;
const float v0 = (vi0 - 8)*d;
const float v1 = (vi1 - 8)*d;
y[i*QK4_2 + l + 0] = v0;
y[i*QK4_2 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
const block_q5_0 * x = (const block_q5_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const uint8_t * pp = x[i].qs;
uint32_t qh;
memcpy(&qh, x[i].qh, sizeof(qh));
for (int l = 0; l < QK5_0; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const int8_t vi0 = ((vi & 0xf) | vh0);
const int8_t vi1 = ((vi >> 4) | vh1);
const float v0 = (vi0 - 16)*d;
const float v1 = (vi1 - 16)*d;
y[i*QK5_0 + l + 0] = v0;
y[i*QK5_0 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
const block_q5_1 * x = (const block_q5_1 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const float m = x[i].m;
const uint8_t * pp = x[i].qs;
const uint32_t qh = x[i].qh;
for (int l = 0; l < QK5_1; l += 2) {
const uint8_t vi = pp[l/2];
const int8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
const int8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
const int8_t vi0 = (vi & 0xf) | vh0;
const int8_t vi1 = (vi >> 4) | vh1;
const float v0 = vi0*d + m;
const float v1 = vi1*d + m;
y[i*QK5_1 + l + 0] = v0;
y[i*QK5_1 + l + 1] = v1;
}
}
static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
const block_q8_0 * x = (const block_q8_0 *) vx;
const int i = blockIdx.x;
const float d = x[i].d;
const int8_t * pp = x[i].qs;
for (int l = 0; l < QK8_0; l++) {
const int8_t vi = pp[l];
y[i*QK8_0 + l] = vi*d;
}
}
void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_0;
dequantize_block_q4_0<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_1;
dequantize_block_q4_1<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK4_2;
dequantize_block_q4_2<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_0;
dequantize_block_q5_0<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK5_1;
dequantize_block_q5_1<<<nb, 1, 0, stream>>>(vx, y);
}
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream) {
const int nb = k / QK8_0;
dequantize_block_q8_0<<<nb, 1, 0, stream>>>(vx, y);
}
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(ggml_type type) {
switch (type) {
case GGML_TYPE_Q4_0:
return dequantize_row_q4_0_cuda;
case GGML_TYPE_Q4_1:
return dequantize_row_q4_1_cuda;
case GGML_TYPE_Q4_2:
return dequantize_row_q4_2_cuda;
case GGML_TYPE_Q5_0:
return dequantize_row_q5_0_cuda;
case GGML_TYPE_Q5_1:
return dequantize_row_q5_1_cuda;
case GGML_TYPE_Q8_0:
return dequantize_row_q8_0_cuda;
default:
return nullptr;
}
}
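Each kernel above dequantizes one block per CUDA thread block back to fp32, and ggml_get_dequantize_row_q_cuda picks the right kernel launcher for a tensor type. A hedged host-side sketch of how a quantized row of k values becomes floats on the device; it assumes the declarations in ggml-cuda.h, and the real path runs inside the cuBLAS matrix-multiplication code rather than standalone:

// illustrative host-side use of the dispatch table above (device buffers assumed allocated)
void dequantize_device_row(const void * d_quantized, float * d_fp32, int k, ggml_type type) {
    dequantize_row_q_cuda_t to_fp32 = ggml_get_dequantize_row_q_cuda(type);
    if (to_fp32 == nullptr) {
        fprintf(stderr, "no CUDA dequantization for ggml type %d\n", (int) type);
        return;
    }
    to_fp32(d_quantized, d_fp32, k, g_cudaStream);    // launches k/block_size thread blocks, async
    CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));  // wait so the caller can use d_fp32
}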
// buffer pool for cuda
#define MAX_CUDA_BUFFERS 16
struct scoped_spin_lock {
std::atomic_flag& lock;
scoped_spin_lock(std::atomic_flag& lock) : lock(lock) {
while (lock.test_and_set(std::memory_order_acquire)) {
; // spin
}
}
~scoped_spin_lock() {
lock.clear(std::memory_order_release);
}
scoped_spin_lock(const scoped_spin_lock&) = delete;
scoped_spin_lock& operator=(const scoped_spin_lock&) = delete;
};
struct cuda_buffer {
void * ptr = nullptr;
size_t size = 0;
};
static cuda_buffer g_cuda_buffer_pool[MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
scoped_spin_lock lock(g_cuda_pool_lock);
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[i];
if (b.size >= size && b.ptr != nullptr) {
void * ptr = b.ptr;
*actual_size = b.size;
b.ptr = nullptr;
b.size = 0;
return ptr;
}
}
void * ptr;
CUDA_CHECK(cudaMalloc((void **) &ptr, size));
*actual_size = size;
return ptr;
}
void ggml_cuda_pool_free(void * ptr, size_t size) {
scoped_spin_lock lock(g_cuda_pool_lock);
for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
cuda_buffer& b = g_cuda_buffer_pool[i];
if (b.ptr == nullptr) {
b.ptr = ptr;
b.size = size;
return;
}
}
fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n");
CUDA_CHECK(cudaFree(ptr));
}
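The pool recycles up to MAX_CUDA_BUFFERS device allocations: ggml_cuda_pool_malloc hands back the first cached buffer that is large enough (falling through to cudaMalloc otherwise), and ggml_cuda_pool_free returns the pointer to the cache instead of freeing it. A short illustrative round trip:

// illustrative round trip through the pool above: the second request can reuse
// the buffer returned by the first instead of hitting cudaMalloc again
void pool_roundtrip_example() {
    size_t actual = 0;
    void * d_scratch = ggml_cuda_pool_malloc(1024*1024, &actual); // at least 1 MB; may be a larger recycled buffer
    // ... launch kernels that read from / write to d_scratch ...
    ggml_cuda_pool_free(d_scratch, actual); // return it (with its true size) for later reuse
}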
cublasHandle_t g_cublasH = nullptr;
cudaStream_t g_cudaStream = nullptr;
cudaStream_t g_cudaStream2 = nullptr;
cudaEvent_t g_cudaEvent = nullptr;
void ggml_init_cublas() {
if (g_cublasH == nullptr) {
// create cublas handle, bind a stream
CUBLAS_CHECK(cublasCreate(&g_cublasH));
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream, cudaStreamNonBlocking));
CUBLAS_CHECK(cublasSetStream(g_cublasH, g_cudaStream));
// create additional stream and event for synchronization
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStream2, cudaStreamNonBlocking));
CUDA_CHECK(cudaEventCreateWithFlags(&g_cudaEvent, cudaEventDisableTiming));
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, NULL));
}
}
cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream) {
const uint64_t ne0 = src->ne[0];
const uint64_t ne1 = src->ne[1];
const uint64_t nb0 = src->nb[0];
const uint64_t nb1 = src->nb[1];
const uint64_t nb2 = src->nb[2];
const uint64_t nb3 = src->nb[3];
const enum ggml_type type = src->type;
const size_t ts = ggml_type_size(type);
const size_t bs = ggml_blck_size(type);
const void * x = (const void *) ((const char *) src->data + i2*nb2 + i3*nb3);
if (nb0 == ts && nb1 == ts*ne0/bs) {
return cudaMemcpyAsync(dst, x, ne1*nb1, cudaMemcpyHostToDevice, stream);
} else if (nb0 == ts) {
return cudaMemcpy2DAsync(dst, ts*ne0/bs, x, nb1, ts*ne0/bs, ne1, cudaMemcpyHostToDevice, stream);
} else {
for (uint64_t i1 = 0; i1 < ne1; i1++) {
const void * rx = (const void *) ((const char *) x + i1*nb1);
void * rd = (void *) ((char *) dst + i1*ts*ne0/bs);
// pretend the row is a matrix with cols=1
cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyHostToDevice, stream);
if (r != cudaSuccess) return r;
}
return cudaSuccess;
}
}
void * ggml_cuda_host_malloc(size_t size) {
void * ptr;
CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
return ptr;
}
void ggml_cuda_host_free(void * ptr) {
CUDA_CHECK(cudaFreeHost(ptr));
}

ggml-cuda.h (new file, 54 lines)

@ -0,0 +1,54 @@
#include <cublas_v2.h>
#include <cuda_runtime.h>

#include "ggml.h"

#ifdef __cplusplus
extern "C" {
#endif

#define CUDA_CHECK(err)                                                               \
    do {                                                                              \
        cudaError_t err_ = (err);                                                     \
        if (err_ != cudaSuccess) {                                                    \
            fprintf(stderr, "CUDA error %d at %s:%d: %s\n", err_, __FILE__, __LINE__, \
                cudaGetErrorString(err_));                                            \
            exit(1);                                                                  \
        }                                                                             \
    } while (0)

#define CUBLAS_CHECK(err)                                                             \
    do {                                                                              \
        cublasStatus_t err_ = (err);                                                  \
        if (err_ != CUBLAS_STATUS_SUCCESS) {                                          \
            fprintf(stderr, "cuBLAS error %d at %s:%d\n", err_, __FILE__, __LINE__);  \
            exit(1);                                                                  \
        }                                                                             \
    } while (0)

extern cublasHandle_t g_cublasH;
extern cudaStream_t   g_cudaStream;
extern cudaStream_t   g_cudaStream2;
extern cudaEvent_t    g_cudaEvent;

void   ggml_init_cublas(void);
void * ggml_cuda_host_malloc(size_t size);
void   ggml_cuda_host_free(void * ptr);

void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
void   ggml_cuda_pool_free(void * ptr, size_t size);

void dequantize_row_q4_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q4_2_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q5_1_cuda(const void * vx, float * y, int k, cudaStream_t stream);
void dequantize_row_q8_0_cuda(const void * vx, float * y, int k, cudaStream_t stream);

cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src, uint64_t i3, uint64_t i2, cudaStream_t stream);

typedef void (*dequantize_row_q_cuda_t)(const void * x, float * y, int k, cudaStream_t stream);
dequantize_row_q_cuda_t ggml_get_dequantize_row_q_cuda(enum ggml_type type);

#ifdef __cplusplus
}
#endif
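
Again purely illustrative (not part of the diff): the error-checking macros and the per-type dequantization lookup declared above, used from a standalone program. Whether ggml_get_dequantize_row_q_cuda() returns NULL for types without a CUDA kernel is an assumption here, not something the header states, so the check below is defensive.

#include <cstdio>
#include <cstdlib>
#include "ggml.h"
#include "ggml-cuda.h"

int main() {
    // CUDA_CHECK / CUBLAS_CHECK report the failing location and exit(1)
    cublasHandle_t handle = NULL;
    CUBLAS_CHECK(cublasCreate(&handle));

    const enum ggml_type types[] = {
        GGML_TYPE_F32,  GGML_TYPE_Q4_0, GGML_TYPE_Q4_1, GGML_TYPE_Q4_2,
        GGML_TYPE_Q5_0, GGML_TYPE_Q5_1, GGML_TYPE_Q8_0,
    };
    for (size_t i = 0; i < sizeof(types)/sizeof(types[0]); ++i) {
        dequantize_row_q_cuda_t fn = ggml_get_dequantize_row_q_cuda(types[i]);
        printf("type %2d: CUDA dequantize %s\n", (int) types[i], fn ? "available" : "not available");
    }

    CUBLAS_CHECK(cublasDestroy(handle));
    return 0;
}
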

ggml.c (4026 changed lines; diff suppressed because it is too large)

ggml.h (287 changed lines)

@ -169,14 +169,27 @@
// //
// //
#ifdef __cplusplus #ifdef GGML_SHARED
extern "C" { # if defined(_WIN32) && !defined(__MINGW32__)
# ifdef GGML_BUILD
# define GGML_API __declspec(dllexport)
# else
# define GGML_API __declspec(dllimport)
# endif
# else
# define GGML_API __attribute__ ((visibility ("default")))
# endif
#else
# define GGML_API
#endif #endif
#include <stdint.h> #include <stdint.h>
#include <stddef.h> #include <stddef.h>
#include <stdbool.h> #include <stdbool.h>
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
#define GGML_MAX_DIMS 4 #define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096 #define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16 #define GGML_MAX_PARAMS 16
@ -184,6 +197,10 @@ extern "C" {
#define GGML_MAX_OPT 4 #define GGML_MAX_OPT 4
#define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_N_THREADS 4
#ifdef __cplusplus
extern "C" {
#endif
#ifdef __ARM_NEON #ifdef __ARM_NEON
// we use the built-in 16-bit float type // we use the built-in 16-bit float type
typedef __fp16 ggml_fp16_t; typedef __fp16 ggml_fp16_t;
@ -192,24 +209,43 @@ typedef uint16_t ggml_fp16_t;
#endif #endif
// convert FP16 <-> FP32 // convert FP16 <-> FP32
float ggml_fp16_to_fp32(ggml_fp16_t x); GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
ggml_fp16_t ggml_fp32_to_fp16(float x); GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
struct ggml_object; struct ggml_object;
struct ggml_context; struct ggml_context;
enum ggml_type { enum ggml_type {
// explicitly numbered values are used in llama.cpp files
GGML_TYPE_F32 = 0, GGML_TYPE_F32 = 0,
GGML_TYPE_F16 = 1, GGML_TYPE_F16 = 1,
GGML_TYPE_Q4_0 = 2, GGML_TYPE_Q4_0 = 2,
GGML_TYPE_Q4_1 = 3, GGML_TYPE_Q4_1 = 3,
GGML_TYPE_Q4_2 = 4,
// GGML_TYPE_Q4_3 (5) support has been removed
GGML_TYPE_Q5_0 = 6,
GGML_TYPE_Q5_1 = 7,
GGML_TYPE_Q8_0 = 8,
GGML_TYPE_Q8_1 = 9,
GGML_TYPE_I8, GGML_TYPE_I8,
GGML_TYPE_I16, GGML_TYPE_I16,
GGML_TYPE_I32, GGML_TYPE_I32,
GGML_TYPE_COUNT, GGML_TYPE_COUNT,
}; };
// model file types
enum ggml_ftype {
GGML_FTYPE_UNKNOWN = -1,
GGML_FTYPE_ALL_F32 = 0,
GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
};
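
As a side note (not part of the diff), the new ggml_ftype values map onto the tensor types above via ggml_ftype_to_ggml_type(), declared a few lines further down together with ggml_type_name(). A minimal sketch of that mapping, mirroring what whisper_model_load() does later in this changeset:

#include <cstdio>
#include "ggml.h"

int main() {
    const enum ggml_ftype ftypes[] = {
        GGML_FTYPE_ALL_F32,     GGML_FTYPE_MOSTLY_F16,  GGML_FTYPE_MOSTLY_Q4_0,
        GGML_FTYPE_MOSTLY_Q4_1, GGML_FTYPE_MOSTLY_Q4_2, GGML_FTYPE_MOSTLY_Q5_0,
        GGML_FTYPE_MOSTLY_Q5_1, GGML_FTYPE_MOSTLY_Q8_0,
    };
    for (size_t i = 0; i < sizeof(ftypes)/sizeof(ftypes[0]); ++i) {
        const enum ggml_type wtype = ggml_ftype_to_ggml_type(ftypes[i]);
        printf("ftype %d -> weight type %s\n", (int) ftypes[i], ggml_type_name(wtype));
    }
    return 0;
}
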
// available tensor operations: // available tensor operations:
enum ggml_op { enum ggml_op {
GGML_OP_NONE = 0, GGML_OP_NONE = 0,
@ -247,6 +283,7 @@ enum ggml_op {
GGML_OP_DIAG_MASK_INF, GGML_OP_DIAG_MASK_INF,
GGML_OP_SOFT_MAX, GGML_OP_SOFT_MAX,
GGML_OP_ROPE, GGML_OP_ROPE,
GGML_OP_ALIBI,
GGML_OP_CONV_1D_1S, GGML_OP_CONV_1D_1S,
GGML_OP_CONV_1D_2S, GGML_OP_CONV_1D_2S,
@ -338,56 +375,66 @@ struct ggml_init_params {
bool no_alloc; // don't allocate memory for the tensor data bool no_alloc; // don't allocate memory for the tensor data
}; };
void ggml_time_init(void); // call this once at the beginning of the program // misc
int64_t ggml_time_ms(void);
int64_t ggml_time_us(void);
int64_t ggml_cycles(void);
int64_t ggml_cycles_per_ms(void);
void ggml_print_object (const struct ggml_object * obj); GGML_API void ggml_time_init(void); // call this once at the beginning of the program
void ggml_print_objects(const struct ggml_context * ctx); GGML_API int64_t ggml_time_ms(void);
GGML_API int64_t ggml_time_us(void);
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);
int64_t ggml_nelements(const struct ggml_tensor * tensor); GGML_API void ggml_print_object (const struct ggml_object * obj);
size_t ggml_nbytes (const struct ggml_tensor * tensor); GGML_API void ggml_print_objects(const struct ggml_context * ctx);
int ggml_blck_size (enum ggml_type type); GGML_API int64_t ggml_nelements(const struct ggml_tensor * tensor);
size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block GGML_API size_t ggml_nbytes (const struct ggml_tensor * tensor);
float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
size_t ggml_element_size(const struct ggml_tensor * tensor); GGML_API int ggml_blck_size (enum ggml_type type);
GGML_API size_t ggml_type_size (enum ggml_type type); // size in bytes for all elements in a block
GGML_API float ggml_type_sizef(enum ggml_type type); // ggml_type_size()/ggml_blck_size() as float
struct ggml_context * ggml_init(struct ggml_init_params params); GGML_API const char * ggml_type_name(enum ggml_type type);
void ggml_free(struct ggml_context * ctx);
size_t ggml_used_mem(const struct ggml_context * ctx); GGML_API size_t ggml_element_size(const struct ggml_tensor * tensor);
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API bool ggml_is_quantized(enum ggml_type type);
struct ggml_tensor * ggml_new_tensor( GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype);
// main
GGML_API struct ggml_context * ggml_init(struct ggml_init_params params);
GGML_API void ggml_free(struct ggml_context * ctx);
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int n_dims, int n_dims,
const int64_t *ne); const int64_t *ne);
struct ggml_tensor * ggml_new_tensor_1d( GGML_API struct ggml_tensor * ggml_new_tensor_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int64_t ne0); int64_t ne0);
struct ggml_tensor * ggml_new_tensor_2d( GGML_API struct ggml_tensor * ggml_new_tensor_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int64_t ne0, int64_t ne0,
int64_t ne1); int64_t ne1);
struct ggml_tensor * ggml_new_tensor_3d( GGML_API struct ggml_tensor * ggml_new_tensor_3d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int64_t ne0, int64_t ne0,
int64_t ne1, int64_t ne1,
int64_t ne2); int64_t ne2);
struct ggml_tensor * ggml_new_tensor_4d( GGML_API struct ggml_tensor * ggml_new_tensor_4d(
struct ggml_context * ctx, struct ggml_context * ctx,
enum ggml_type type, enum ggml_type type,
int64_t ne0, int64_t ne0,
@ -395,122 +442,127 @@ struct ggml_tensor * ggml_new_tensor_4d(
int64_t ne2, int64_t ne2,
int64_t ne3); int64_t ne3);
struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value); GGML_API struct ggml_tensor * ggml_new_i32(struct ggml_context * ctx, int32_t value);
struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value); GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i); GGML_API int32_t ggml_get_i32_1d(const struct ggml_tensor * tensor, int i);
void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value); GGML_API void ggml_set_i32_1d(const struct ggml_tensor * tensor, int i, int32_t value);
float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i); GGML_API float ggml_get_f32_1d(const struct ggml_tensor * tensor, int i);
void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value); GGML_API void ggml_set_f32_1d(const struct ggml_tensor * tensor, int i, float value);
void * ggml_get_data (const struct ggml_tensor * tensor); GGML_API void * ggml_get_data (const struct ggml_tensor * tensor);
float * ggml_get_data_f32(const struct ggml_tensor * tensor); GGML_API float * ggml_get_data_f32(const struct ggml_tensor * tensor);
// //
// operations on tensors with backpropagation // operations on tensors with backpropagation
// //
struct ggml_tensor * ggml_dup( GGML_API struct ggml_tensor * ggml_dup(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_add( GGML_API struct ggml_tensor * ggml_add(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_sub( GGML_API struct ggml_tensor * ggml_add_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_mul( GGML_API struct ggml_tensor * ggml_sub(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_div( GGML_API struct ggml_tensor * ggml_mul(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_sqr( GGML_API struct ggml_tensor * ggml_div(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_sqr(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_sqrt( GGML_API struct ggml_tensor * ggml_sqrt(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// return scalar // return scalar
// TODO: compute sum along rows // TODO: compute sum along rows
struct ggml_tensor * ggml_sum( GGML_API struct ggml_tensor * ggml_sum(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// mean along rows // mean along rows
struct ggml_tensor * ggml_mean( GGML_API struct ggml_tensor * ggml_mean(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// if a is the same shape as b, and a is not parameter, return a // if a is the same shape as b, and a is not parameter, return a
// otherwise, return a new tensor: repeat(a) to fit in b // otherwise, return a new tensor: repeat(a) to fit in b
struct ggml_tensor * ggml_repeat( GGML_API struct ggml_tensor * ggml_repeat(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_abs( GGML_API struct ggml_tensor * ggml_abs(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_sgn( GGML_API struct ggml_tensor * ggml_sgn(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_neg( GGML_API struct ggml_tensor * ggml_neg(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_step( GGML_API struct ggml_tensor * ggml_step(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_relu( GGML_API struct ggml_tensor * ggml_relu(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// TODO: double-check this computation is correct // TODO: double-check this computation is correct
struct ggml_tensor * ggml_gelu( GGML_API struct ggml_tensor * ggml_gelu(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_silu( GGML_API struct ggml_tensor * ggml_silu(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// normalize along rows // normalize along rows
// TODO: eps is hardcoded to 1e-5 for now // TODO: eps is hardcoded to 1e-5 for now
struct ggml_tensor * ggml_norm( GGML_API struct ggml_tensor * ggml_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_rms_norm( GGML_API struct ggml_tensor * ggml_rms_norm(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// A: m rows, n columns // A: m rows, n columns
// B: p rows, n columns (i.e. we transpose it internally) // B: p rows, n columns (i.e. we transpose it internally)
// result is m columns, p rows // result is m columns, p rows
struct ggml_tensor * ggml_mul_mat( GGML_API struct ggml_tensor * ggml_mul_mat(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
@ -520,32 +572,32 @@ struct ggml_tensor * ggml_mul_mat(
// //
// in-place, returns view(a) // in-place, returns view(a)
struct ggml_tensor * ggml_scale( GGML_API struct ggml_tensor * ggml_scale(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// a -> b, return view(b) // a -> b, return view(b)
struct ggml_tensor * ggml_cpy( GGML_API struct ggml_tensor * ggml_cpy(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// make contiguous // make contiguous
struct ggml_tensor * ggml_cont( GGML_API struct ggml_tensor * ggml_cont(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// return view(a), b specifies the new shape // return view(a), b specifies the new shape
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape( GGML_API struct ggml_tensor * ggml_reshape(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// return view(a) // return view(a)
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_2d( GGML_API struct ggml_tensor * ggml_reshape_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
@ -553,7 +605,7 @@ struct ggml_tensor * ggml_reshape_2d(
// return view(a) // return view(a)
// TODO: when we start computing gradient, make a copy instead of view // TODO: when we start computing gradient, make a copy instead of view
struct ggml_tensor * ggml_reshape_3d( GGML_API struct ggml_tensor * ggml_reshape_3d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
@ -561,13 +613,13 @@ struct ggml_tensor * ggml_reshape_3d(
int64_t ne2); int64_t ne2);
// offset in bytes // offset in bytes
struct ggml_tensor * ggml_view_1d( GGML_API struct ggml_tensor * ggml_view_1d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
size_t offset); size_t offset);
struct ggml_tensor * ggml_view_2d( GGML_API struct ggml_tensor * ggml_view_2d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
@ -575,7 +627,7 @@ struct ggml_tensor * ggml_view_2d(
size_t nb1, // row stride in bytes size_t nb1, // row stride in bytes
size_t offset); size_t offset);
struct ggml_tensor * ggml_view_3d( GGML_API struct ggml_tensor * ggml_view_3d(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int64_t ne0, int64_t ne0,
@ -585,7 +637,7 @@ struct ggml_tensor * ggml_view_3d(
size_t nb2, // slice stride in bytes size_t nb2, // slice stride in bytes
size_t offset); size_t offset);
struct ggml_tensor * ggml_permute( GGML_API struct ggml_tensor * ggml_permute(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int axis0, int axis0,
@ -594,60 +646,69 @@ struct ggml_tensor * ggml_permute(
int axis3); int axis3);
// alias for ggml_permute(ctx, a, 1, 0, 2, 3) // alias for ggml_permute(ctx, a, 1, 0, 2, 3)
struct ggml_tensor * ggml_transpose( GGML_API struct ggml_tensor * ggml_transpose(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
struct ggml_tensor * ggml_get_rows( GGML_API struct ggml_tensor * ggml_get_rows(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
// set elements above the diagonal to -INF // set elements above the diagonal to -INF
// in-place, returns view(a) // in-place, returns view(a)
struct ggml_tensor * ggml_diag_mask_inf( GGML_API struct ggml_tensor * ggml_diag_mask_inf(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past); int n_past);
// in-place, returns view(a) // in-place, returns view(a)
struct ggml_tensor * ggml_soft_max( GGML_API struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// rotary position embedding // rotary position embedding
// in-place, returns view(a) // in-place, returns view(a)
// if mode == 1, skip n_past elements // if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
// TODO: avoid creating a new tensor every time // TODO: avoid creating a new tensor every time
struct ggml_tensor * ggml_rope( GGML_API struct ggml_tensor * ggml_rope(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
int n_past, int n_past,
int n_dims, int n_dims,
int mode); int mode);
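
Purely for illustration (not part of the diff): an end-to-end sketch of ggml_rope() with the mode bit-flags documented above. Tensor shapes are made up, and the default thread count set by ggml_build_forward() is assumed.

#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_init_params ip = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // (head_dim, n_tokens, n_heads) activations, zero-filled for the demo
    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 64, 32, 4);
    ggml_set_f32(a, 0.0f);

    const int n_past = 0;
    const int mode   = 0; // bit 0: skip n_past elements, bit 1: GPT-NeoX style

    struct ggml_tensor * r  = ggml_rope(ctx, a, n_past, /*n_dims=*/64, mode);
    struct ggml_cgraph   gf = ggml_build_forward(r);
    ggml_graph_compute(ctx, &gf);

    printf("r[0] = %f\n", ggml_get_f32_1d(r, 0));

    ggml_free(ctx);
    return 0;
}
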
// alibi position embedding
// in-place, returns view(a)
struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head);
// padding = 1 // padding = 1
// TODO: we don't support extra parameters for now // TODO: we don't support extra parameters for now
// that's why we are hard-coding the stride, padding, and dilation // that's why we are hard-coding the stride, padding, and dilation
// not great .. // not great ..
struct ggml_tensor * ggml_conv_1d_1s( GGML_API struct ggml_tensor * ggml_conv_1d_1s(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_conv_1d_2s( GGML_API struct ggml_tensor * ggml_conv_1d_2s(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b); struct ggml_tensor * b);
struct ggml_tensor * ggml_flash_attn( GGML_API struct ggml_tensor * ggml_flash_attn(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * q, struct ggml_tensor * q,
struct ggml_tensor * k, struct ggml_tensor * k,
struct ggml_tensor * v, struct ggml_tensor * v,
bool masked); bool masked);
struct ggml_tensor * ggml_flash_ff( GGML_API struct ggml_tensor * ggml_flash_ff(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b0, struct ggml_tensor * b0,
@ -659,12 +720,12 @@ struct ggml_tensor * ggml_flash_ff(
typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *); typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *); typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
struct ggml_tensor * ggml_map_unary_f32( GGML_API struct ggml_tensor * ggml_map_unary_f32(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
const ggml_unary_op_f32_t fun); const ggml_unary_op_f32_t fun);
struct ggml_tensor * ggml_map_binary_f32( GGML_API struct ggml_tensor * ggml_map_binary_f32(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * b, struct ggml_tensor * b,
@ -674,23 +735,23 @@ struct ggml_tensor * ggml_map_binary_f32(
// automatic differentiation // automatic differentiation
// //
void ggml_set_param( GGML_API void ggml_set_param(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * tensor); struct ggml_tensor * tensor);
void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor); GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor); GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep); GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
void ggml_graph_reset (struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
// print info and performance information for the graph // print info and performance information for the graph
void ggml_graph_print(const struct ggml_cgraph * cgraph); GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
// dump the graph into a file using the dot format // dump the graph into a file using the dot format
void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename); GGML_API void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph * gf, const char * filename);
// //
// optimization // optimization
@ -783,10 +844,10 @@ struct ggml_opt_params {
} lbfgs; } lbfgs;
}; };
struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type); GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
// optimize the function defined by the tensor f // optimize the function defined by the tensor f
enum ggml_opt_result ggml_opt( GGML_API enum ggml_opt_result ggml_opt(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_opt_params params, struct ggml_opt_params params,
struct ggml_tensor * f); struct ggml_tensor * f);
@ -795,26 +856,36 @@ enum ggml_opt_result ggml_opt(
// quantization // quantization
// //
size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist);
size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * hist);
GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist);
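
For reference (not part of the diff), a minimal sketch of the new chunked quantization entry point. It assumes that n must be a multiple of the block size and that hist has 16 buckets, as with the existing per-type quantizers.

#include <cstdio>
#include <vector>
#include "ggml.h"

int main() {
    const int n = 256; // multiple of the Q4_0 block size (32)
    std::vector<float> src(n);
    for (int i = 0; i < n; ++i) {
        src[i] = 0.01f*(i - n/2);
    }

    std::vector<uint8_t> dst(ggml_type_size(GGML_TYPE_Q4_0)*(n/ggml_blck_size(GGML_TYPE_Q4_0)));
    int64_t hist[16] = {0};

    const size_t written = ggml_quantize_chunk(GGML_TYPE_Q4_0, src.data(), dst.data(), /*start=*/0, n, hist);
    printf("quantized %d floats into %zu bytes\n", n, written);
    return 0;
}
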
// //
// system info // system info
// //
int ggml_cpu_has_avx(void); GGML_API int ggml_cpu_has_avx (void);
int ggml_cpu_has_avx2(void); GGML_API int ggml_cpu_has_avx2 (void);
int ggml_cpu_has_avx512(void); GGML_API int ggml_cpu_has_avx512 (void);
int ggml_cpu_has_fma(void); GGML_API int ggml_cpu_has_avx512_vbmi(void);
int ggml_cpu_has_neon(void); GGML_API int ggml_cpu_has_avx512_vnni(void);
int ggml_cpu_has_arm_fma(void); GGML_API int ggml_cpu_has_fma (void);
int ggml_cpu_has_f16c(void); GGML_API int ggml_cpu_has_neon (void);
int ggml_cpu_has_fp16_va(void); GGML_API int ggml_cpu_has_arm_fma (void);
int ggml_cpu_has_wasm_simd(void); GGML_API int ggml_cpu_has_f16c (void);
int ggml_cpu_has_blas(void); GGML_API int ggml_cpu_has_fp16_va (void);
int ggml_cpu_has_sse3(void); GGML_API int ggml_cpu_has_wasm_simd (void);
int ggml_cpu_has_vsx(void); GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_clblast (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_vsx (void);
// //
// Internal types and functions exposed for tests and benchmarks // Internal types and functions exposed for tests and benchmarks
@ -834,7 +905,9 @@ typedef struct {
dequantize_row_q_t dequantize_row_q; dequantize_row_q_t dequantize_row_q;
quantize_row_q_t quantize_row_q; quantize_row_q_t quantize_row_q;
quantize_row_q_t quantize_row_q_reference; quantize_row_q_t quantize_row_q_reference;
quantize_row_q_t quantize_row_q_dot;
vec_dot_q_t vec_dot_q; vec_dot_q_t vec_dot_q;
enum ggml_type vec_dot_type;
} quantize_fns_t; } quantize_fns_t;
quantize_fns_t ggml_internal_get_quantize_fn(size_t i); quantize_fns_t ggml_internal_get_quantize_fn(size_t i);

models/convert-h5-to-ggml.py

@ -23,6 +23,7 @@ import json
import code import code
import torch import torch
import numpy as np import numpy as np
from pathlib import Path
from transformers import WhisperForConditionalGeneration from transformers import WhisperForConditionalGeneration
@ -75,16 +76,13 @@ if len(sys.argv) < 4:
print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n") print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n")
sys.exit(1) sys.exit(1)
dir_model = sys.argv[1] dir_model = Path(sys.argv[1])
dir_whisper = sys.argv[2] dir_whisper = Path(sys.argv[2])
dir_out = sys.argv[3] dir_out = Path(sys.argv[3])
with open(dir_model + "/vocab.json", "r", encoding="utf8") as f: encoder = json.load((dir_model / "vocab.json").open("r", encoding="utf8"))
encoder = json.load(f) encoder_added = json.load((dir_model / "added_tokens.json").open( "r", encoding="utf8"))
with open(dir_model + "/added_tokens.json", "r", encoding="utf8") as f: hparams = json.load((dir_model / "config.json").open("r", encoding="utf8") )
encoder_added = json.load(f)
with open(dir_model + "/config.json", "r", encoding="utf8") as f:
hparams = json.load(f)
model = WhisperForConditionalGeneration.from_pretrained(dir_model) model = WhisperForConditionalGeneration.from_pretrained(dir_model)
@ -96,16 +94,15 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
dir_tokenizer = dir_model dir_tokenizer = dir_model
fname_out = dir_out + "/ggml-model.bin" fname_out = dir_out / "ggml-model.bin"
with open(dir_tokenizer + "/vocab.json", "r", encoding="utf8") as f: tokens = json.load(open(dir_tokenizer / "vocab.json", "r", encoding="utf8"))
tokens = json.load(f)
# use 16-bit or 32-bit floats # use 16-bit or 32-bit floats
use_f16 = True use_f16 = True
if len(sys.argv) > 4: if len(sys.argv) > 4:
use_f16 = False use_f16 = False
fname_out = dir_out + "/ggml-model-f32.bin" fname_out = dir_out / "ggml-model-f32.bin"
fout = open(fname_out, "wb") fout = open(fname_out, "wb")
@ -171,10 +168,9 @@ for name in list_vars.keys():
data = data.astype(np.float16) data = data.astype(np.float16)
# reshape conv bias from [n] to [n, 1] # reshape conv bias from [n] to [n, 1]
if name == "encoder.conv1.bias" or \ if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
name == "encoder.conv2.bias":
data = data.reshape(data.shape[0], 1) data = data.reshape(data.shape[0], 1)
print(" Reshaped variable: " + name + " to shape: ", data.shape) print(" Reshaped variable: " , name , " to shape: ", data.shape)
n_dims = len(data.shape) n_dims = len(data.shape)
print(name, n_dims, data.shape) print(name, n_dims, data.shape)
@ -182,7 +178,7 @@ for name in list_vars.keys():
# looks like the whisper models are in f16 by default # looks like the whisper models are in f16 by default
# so we need to convert the small tensors to f32 until we fully support f16 in ggml # so we need to convert the small tensors to f32 until we fully support f16 in ggml
# ftype == 0 -> float32, ftype == 1 -> float16 # ftype == 0 -> float32, ftype == 1 -> float16
ftype = 1; ftype = 1
if use_f16: if use_f16:
if n_dims < 2 or \ if n_dims < 2 or \
name == "encoder.conv1.bias" or \ name == "encoder.conv1.bias" or \
@ -197,16 +193,16 @@ for name in list_vars.keys():
ftype = 0 ftype = 0
# header # header
str = name.encode('utf-8') str_ = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype)) fout.write(struct.pack("iii", n_dims, len(str_), ftype))
for i in range(n_dims): for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str); fout.write(str_)
# data # data
data.tofile(fout) data.tofile(fout)
fout.close() fout.close()
print("Done. Output file: " + fname_out) print("Done. Output file: " , fname_out)
print("") print("")

models/convert-pt-to-ggml.py

@ -40,7 +40,7 @@ import code
import torch import torch
import numpy as np import numpy as np
import base64 import base64
from pathlib import Path
#from transformers import GPTJForCausalLM #from transformers import GPTJForCausalLM
#from transformers import GPT2TokenizerFast #from transformers import GPT2TokenizerFast
@ -194,17 +194,17 @@ if len(sys.argv) < 4:
print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n") print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
sys.exit(1) sys.exit(1)
fname_inp = sys.argv[1] fname_inp = Path(sys.argv[1])
dir_whisper = sys.argv[2] dir_whisper = Path(sys.argv[2])
dir_out = sys.argv[3] dir_out = Path(sys.argv[3])
# try to load PyTorch binary data # try to load PyTorch binary data
try: try:
model_bytes = open(fname_inp, "rb").read() model_bytes = open(fname_inp, "rb").read()
with io.BytesIO(model_bytes) as fp: with io.BytesIO(model_bytes) as fp:
checkpoint = torch.load(fp, map_location="cpu") checkpoint = torch.load(fp, map_location="cpu")
except: except Exception:
print("Error: failed to load PyTorch model file: %s" % fname_inp) print("Error: failed to load PyTorch model file:" , fname_inp)
sys.exit(1) sys.exit(1)
hparams = checkpoint["dims"] hparams = checkpoint["dims"]
@ -218,17 +218,17 @@ list_vars = checkpoint["model_state_dict"]
# load mel filters # load mel filters
n_mels = hparams["n_mels"] n_mels = hparams["n_mels"]
with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f: with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
filters = torch.from_numpy(f[f"mel_{n_mels}"]) filters = torch.from_numpy(f[f"mel_{n_mels}"])
#print (filters) #print (filters)
#code.interact(local=locals()) #code.interact(local=locals())
multilingual = hparams["n_vocab"] == 51865 multilingual = hparams["n_vocab"] == 51865
tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual.tiktoken" or "gpt2.tiktoken") tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
# output in the same directory as the model # output in the same directory as the model
fname_out = dir_out + "/ggml-model.bin" fname_out = dir_out / "ggml-model.bin"
with open(tokenizer, "rb") as f: with open(tokenizer, "rb") as f:
contents = f.read() contents = f.read()
@ -238,9 +238,9 @@ with open(tokenizer, "rb") as f:
use_f16 = True use_f16 = True
if len(sys.argv) > 4: if len(sys.argv) > 4:
use_f16 = False use_f16 = False
fname_out = dir_out + "/ggml-model-f32.bin" fname_out = dir_out / "ggml-model-f32.bin"
fout = open(fname_out, "wb") fout = fname_out.open("wb")
fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["n_vocab"])) fout.write(struct.pack("i", hparams["n_vocab"]))
@ -273,20 +273,19 @@ for key in tokens:
for name in list_vars.keys(): for name in list_vars.keys():
data = list_vars[name].squeeze().numpy() data = list_vars[name].squeeze().numpy()
print("Processing variable: " + name + " with shape: ", data.shape) print("Processing variable: " , name , " with shape: ", data.shape)
# reshape conv bias from [n] to [n, 1] # reshape conv bias from [n] to [n, 1]
if name == "encoder.conv1.bias" or \ if name in ["encoder.conv1.bias", "encoder.conv2.bias"]:
name == "encoder.conv2.bias":
data = data.reshape(data.shape[0], 1) data = data.reshape(data.shape[0], 1)
print(" Reshaped variable: " + name + " to shape: ", data.shape) print(f" Reshaped variable: {name} to shape: ", data.shape)
n_dims = len(data.shape); n_dims = len(data.shape)
# looks like the whisper models are in f16 by default # looks like the whisper models are in f16 by default
# so we need to convert the small tensors to f32 until we fully support f16 in ggml # so we need to convert the small tensors to f32 until we fully support f16 in ggml
# ftype == 0 -> float32, ftype == 1 -> float16 # ftype == 0 -> float32, ftype == 1 -> float16
ftype = 1; ftype = 1
if use_f16: if use_f16:
if n_dims < 2 or \ if n_dims < 2 or \
name == "encoder.conv1.bias" or \ name == "encoder.conv1.bias" or \
@ -307,16 +306,16 @@ for name in list_vars.keys():
# data = data.transpose() # data = data.transpose()
# header # header
str = name.encode('utf-8') str_ = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), ftype)) fout.write(struct.pack("iii", n_dims, len(str_), ftype))
for i in range(n_dims): for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str); fout.write(str_)
# data # data
data.tofile(fout) data.tofile(fout)
fout.close() fout.close()
print("Done. Output file: " + fname_out) print("Done. Output file: " , fname_out)
print("") print("")

models/convert-whisper-to-coreml.py

@ -20,7 +20,7 @@ def linear_to_conv2d_map(state_dict, prefix, local_metadata, strict,
""" """
for k in state_dict: for k in state_dict:
is_attention = all(substr in k for substr in ['attn', '.weight']) is_attention = all(substr in k for substr in ['attn', '.weight'])
is_mlp = any([k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight']]) is_mlp = any(k.endswith(s) for s in ['mlp.0.weight', 'mlp.2.weight'])
if (is_attention or is_mlp) and len(state_dict[k].shape) == 2: if (is_attention or is_mlp) and len(state_dict[k].shape) == 2:
state_dict[k] = state_dict[k][:, :, None, None] state_dict[k] = state_dict[k][:, :, None, None]
@ -42,11 +42,10 @@ class LayerNormANE(LayerNormANEBase):
class MultiHeadAttentionANE(MultiHeadAttention): class MultiHeadAttentionANE(MultiHeadAttention):
def __init__(self, n_state: int, n_head: int): def __init__(self, n_state: int, n_head: int):
super().__init__(n_state, n_head) super().__init__(n_state, n_head)
self.query = nn.Conv2d(n_state, n_state, kernel_size=1)
setattr(self, 'query', nn.Conv2d(n_state, n_state, kernel_size=1)) self.key = nn.Conv2d(n_state, n_state, kernel_size=1, bias=False)
setattr(self, 'key', nn.Conv2d(n_state, n_state, kernel_size=1, bias=False)) self.value = nn.Conv2d(n_state, n_state, kernel_size=1)
setattr(self, 'value', nn.Conv2d(n_state, n_state, kernel_size=1)) self.out = nn.Conv2d(n_state, n_state, kernel_size=1)
setattr(self, 'out', nn.Conv2d(n_state, n_state, kernel_size=1))
def forward(self, def forward(self,
x: Tensor, x: Tensor,
@ -104,30 +103,28 @@ class MultiHeadAttentionANE(MultiHeadAttention):
class ResidualAttentionBlockANE(ResidualAttentionBlock): class ResidualAttentionBlockANE(ResidualAttentionBlock):
def __init__(self, n_state: int, n_head: int, cross_attention: bool = False): def __init__(self, n_state: int, n_head: int, cross_attention: bool = False):
super().__init__(n_state, n_head, cross_attention) super().__init__(n_state, n_head, cross_attention)
self.attn = MultiHeadAttentionANE(n_state, n_head)
setattr(self, 'attn', MultiHeadAttentionANE(n_state, n_head)) self.attn_ln = LayerNormANE(n_state)
setattr(self, 'attn_ln', LayerNormANE(n_state)) self.cross_attn = MultiHeadAttentionANE(n_state, n_head) if cross_attention else None
self.cross_attn_ln = LayerNormANE(n_state) if cross_attention else None
setattr(self, 'cross_attn', MultiHeadAttentionANE(n_state, n_head) if cross_attention else None)
setattr(self, 'cross_attn_ln', LayerNormANE(n_state) if cross_attention else None)
n_mlp = n_state * 4 n_mlp = n_state * 4
setattr(self, 'mlp', nn.Sequential( self.mlp = nn.Sequential(
nn.Conv2d(n_state, n_mlp, kernel_size=1), nn.Conv2d(n_state, n_mlp, kernel_size=1),
nn.GELU(), nn.GELU(),
nn.Conv2d(n_mlp, n_state, kernel_size=1) nn.Conv2d(n_mlp, n_state, kernel_size=1)
)) )
setattr(self, 'mlp_ln', LayerNormANE(n_state)) self.mlp_ln = LayerNormANE(n_state)
class AudioEncoderANE(AudioEncoder): class AudioEncoderANE(AudioEncoder):
def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): def __init__(self, n_mels: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
super().__init__(n_mels, n_ctx, n_state, n_head, n_layer) super().__init__(n_mels, n_ctx, n_state, n_head, n_layer)
setattr(self, 'blocks', nn.ModuleList( self.blocks = nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)] [ResidualAttentionBlockANE(n_state, n_head) for _ in range(n_layer)]
)) )
setattr(self, 'ln_post', LayerNormANE(n_state)) self.ln_post = LayerNormANE(n_state)
def forward(self, x: Tensor): def forward(self, x: Tensor):
""" """
@ -168,10 +165,10 @@ class TextDecoderANE(TextDecoder):
def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int): def __init__(self, n_vocab: int, n_ctx: int, n_state: int, n_head: int, n_layer: int):
super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer) super().__init__(n_vocab, n_ctx, n_state, n_head, n_layer)
setattr(self, 'blocks', nn.ModuleList( self.blocks= nn.ModuleList(
[ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)] [ResidualAttentionBlockANE(n_state, n_head, cross_attention=True) for _ in range(n_layer)]
)) )
setattr(self, 'ln', LayerNormANE(n_state)) self.ln= LayerNormANE(n_state)
def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None): def forward(self, x: Tensor, xa: Tensor, kv_cache: Optional[dict] = None):
""" """
@ -213,20 +210,20 @@ class WhisperANE(Whisper):
def __init__(self, dims: ModelDimensions): def __init__(self, dims: ModelDimensions):
super().__init__(dims) super().__init__(dims)
setattr(self, 'encoder', AudioEncoderANE( self.encoder = AudioEncoderANE(
self.dims.n_mels, self.dims.n_mels,
self.dims.n_audio_ctx, self.dims.n_audio_ctx,
self.dims.n_audio_state, self.dims.n_audio_state,
self.dims.n_audio_head, self.dims.n_audio_head,
self.dims.n_audio_layer, self.dims.n_audio_layer,
)) )
setattr(self, 'decoder', TextDecoderANE( self.decoder = TextDecoderANE(
self.dims.n_vocab, self.dims.n_vocab,
self.dims.n_text_ctx, self.dims.n_text_ctx,
self.dims.n_text_state, self.dims.n_text_state,
self.dims.n_text_head, self.dims.n_text_head,
self.dims.n_text_layer, self.dims.n_text_layer,
)) )
self._register_load_state_dict_pre_hook(linear_to_conv2d_map) self._register_load_state_dict_pre_hook(linear_to_conv2d_map)

tests/run-tests.sh

@ -13,7 +13,7 @@
# #
# Usage: # Usage:
# #
# ./tests/run-tests.sh <model_name> # ./tests/run-tests.sh <model_name> [threads]
# #
cd `dirname $0` cd `dirname $0`
@ -32,7 +32,7 @@ function list_models {
} }
if [ $# -eq 0 ]; then if [ $# -eq 0 ]; then
printf "Usage: $0 [model]\n\n" printf "Usage: $0 [model] [threads]\n\n"
printf "No model specified. Aborting\n" printf "No model specified. Aborting\n"
list_models list_models
exit 1 exit 1
@ -41,6 +41,11 @@ fi
model=$1 model=$1
main="../main" main="../main"
threads=""
if [ $# -eq 2 ]; then
threads="-t $2"
fi
if [ ! -f ../models/ggml-$model.bin ]; then if [ ! -f ../models/ggml-$model.bin ]; then
printf "Model $model not found. Aborting\n" printf "Model $model not found. Aborting\n"
list_models list_models
@ -105,7 +110,7 @@ function run_lang() {
fi fi
fi fi
$main -m ../models/ggml-$model.bin -f $fname_dst -l $lang -otxt 2> /dev/null $main -m ../models/ggml-$model.bin $threads -f $fname_dst -l $lang -otxt 2> /dev/null
git diff --no-index --word-diff=color --word-diff-regex=. $lang-$i-ref.txt $fname_dst.txt git diff --no-index --word-diff=color --word-diff-regex=. $lang-$i-ref.txt $fname_dst.txt

whisper.cpp

@ -1,4 +1,3 @@
#define WHISPER_BUILD
#include "whisper.h" #include "whisper.h"
#if WHISPER_USE_COREML #if WHISPER_USE_COREML
#include "coreml/whisper-encoder.h" #include "coreml/whisper-encoder.h"
@ -102,7 +101,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
#define WHISPER_PRINT_DEBUG(...) #define WHISPER_PRINT_DEBUG(...)
#endif #endif
#define WHISPER_USE_FLASH_ATTN //#define WHISPER_USE_FLASH_ATTN
//#define WHISPER_USE_FLASH_FF //#define WHISPER_USE_FLASH_FF
#define WHISPER_MAX_DECODERS 16 #define WHISPER_MAX_DECODERS 16
@ -224,11 +223,11 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
static const size_t MB = 1ull*1024*1024; static const size_t MB = 1ull*1024*1024;
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = { static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
{ MODEL_TINY, 14ull*MB }, { MODEL_TINY, 62ull*MB },
{ MODEL_BASE, 18ull*MB }, { MODEL_BASE, 80ull*MB },
{ MODEL_SMALL, 28ull*MB }, { MODEL_SMALL, 120ull*MB },
{ MODEL_MEDIUM, 36ull*MB }, { MODEL_MEDIUM, 158ull*MB },
{ MODEL_LARGE, 44ull*MB }, { MODEL_LARGE, 198ull*MB },
}; };
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = { static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@ -255,12 +254,70 @@ static const std::map<e_model, size_t> MEM_REQ_SCRATCH3 = {
{ MODEL_LARGE, 9ull*MB }, { MODEL_LARGE, 9ull*MB },
}; };
static const std::map<e_model, size_t> MEM_REQ_MODEL = { static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
{ GGML_TYPE_F32,
{
{ MODEL_TINY, 74ull*MB }, { MODEL_TINY, 74ull*MB },
{ MODEL_BASE, 142ull*MB }, { MODEL_BASE, 142ull*MB },
{ MODEL_SMALL, 466ull*MB }, { MODEL_SMALL, 466ull*MB },
{ MODEL_MEDIUM, 1464ull*MB }, { MODEL_MEDIUM, 1464ull*MB },
{ MODEL_LARGE, 2952ull*MB }, { MODEL_LARGE, 2952ull*MB },
},
},
{ GGML_TYPE_F16,
{
{ MODEL_TINY, 74ull*MB },
{ MODEL_BASE, 142ull*MB },
{ MODEL_SMALL, 466ull*MB },
{ MODEL_MEDIUM, 1464ull*MB },
{ MODEL_LARGE, 2952ull*MB },
},
},
{ GGML_TYPE_Q4_0,
{
{ MODEL_TINY, 26ull*MB },
{ MODEL_BASE, 50ull*MB },
{ MODEL_SMALL, 154ull*MB },
{ MODEL_MEDIUM, 470ull*MB },
{ MODEL_LARGE, 940ull*MB },
},
},
{ GGML_TYPE_Q4_1,
{
{ MODEL_TINY, 31ull*MB },
{ MODEL_BASE, 57ull*MB },
{ MODEL_SMALL, 181ull*MB },
{ MODEL_MEDIUM, 559ull*MB },
{ MODEL_LARGE, 1122ull*MB },
},
},
{ GGML_TYPE_Q4_2,
{
{ MODEL_TINY, 26ull*MB },
{ MODEL_BASE, 50ull*MB },
{ MODEL_SMALL, 154ull*MB },
{ MODEL_MEDIUM, 470ull*MB },
{ MODEL_LARGE, 940ull*MB },
},
},
{ GGML_TYPE_Q5_0, // TODO: fix
{
{ MODEL_TINY, 31ull*MB },
{ MODEL_BASE, 57ull*MB },
{ MODEL_SMALL, 181ull*MB },
{ MODEL_MEDIUM, 559ull*MB },
{ MODEL_LARGE, 1122ull*MB },
},
},
{ GGML_TYPE_Q5_1,
{
{ MODEL_TINY, 31ull*MB },
{ MODEL_BASE, 57ull*MB },
{ MODEL_SMALL, 181ull*MB },
{ MODEL_MEDIUM, 559ull*MB },
{ MODEL_LARGE, 1122ull*MB },
},
},
}; };
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = { static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
@ -280,11 +337,11 @@ static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
}; };
static const std::map<e_model, size_t> MEM_REQ_ENCODE = { static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
{ MODEL_TINY, 6ull*MB }, { MODEL_TINY, 30ull*MB },
{ MODEL_BASE, 8ull*MB }, { MODEL_BASE, 38ull*MB },
{ MODEL_SMALL, 13ull*MB }, { MODEL_SMALL, 56ull*MB },
{ MODEL_MEDIUM, 22ull*MB }, { MODEL_MEDIUM, 74ull*MB },
{ MODEL_LARGE, 33ull*MB }, { MODEL_LARGE, 94ull*MB },
}; };
static const std::map<e_model, size_t> MEM_REQ_DECODE = { static const std::map<e_model, size_t> MEM_REQ_DECODE = {
@ -370,7 +427,7 @@ struct whisper_hparams {
int32_t n_text_head = 6; int32_t n_text_head = 6;
int32_t n_text_layer = 4; int32_t n_text_layer = 4;
int32_t n_mels = 80; int32_t n_mels = 80;
int32_t f16 = 1; int32_t ftype = 1;
}; };
// audio encoding layer // audio encoding layer
@ -592,7 +649,7 @@ struct whisper_state {
std::string path_model; // populated by whisper_init_from_file() std::string path_model; // populated by whisper_init_from_file()
#ifdef WHISPER_USE_COREML #ifdef WHISPER_USE_COREML
whisper_coreml_context * ctx_coreml; whisper_coreml_context * ctx_coreml = nullptr;
#endif #endif
// [EXPERIMENTAL] token-level timestamps data // [EXPERIMENTAL] token-level timestamps data
@ -640,7 +697,8 @@ struct whisper_context {
int64_t t_load_us = 0; int64_t t_load_us = 0;
int64_t t_start_us = 0; int64_t t_start_us = 0;
ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16) ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
whisper_model model; whisper_model model;
whisper_vocab vocab; whisper_vocab vocab;
@ -697,7 +755,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) {
const ggml_type wtype = cache.k->type; const ggml_type wtype = cache.k->type;
WHISPER_ASSERT(wtype == cache.v->type); WHISPER_ASSERT(wtype == cache.v->type);
WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype)); WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
struct ggml_init_params params = { struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(), /*.mem_size =*/ cache.buf.size(),
@ -770,7 +828,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
read_safe(loader, hparams.n_text_head); read_safe(loader, hparams.n_text_head);
read_safe(loader, hparams.n_text_layer); read_safe(loader, hparams.n_text_layer);
read_safe(loader, hparams.n_mels); read_safe(loader, hparams.n_mels);
read_safe(loader, hparams.f16); read_safe(loader, hparams.ftype);
assert(hparams.n_text_state == hparams.n_audio_state); assert(hparams.n_text_state == hparams.n_audio_state);
@ -794,11 +852,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
model.type = e_model::MODEL_LARGE; model.type = e_model::MODEL_LARGE;
} }
// for the big tensors, we have the option to store the data in 16-bit floats // for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation // in order to save memory and also to speed up the computation
wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wctx.wtype == GGML_TYPE_COUNT) {
fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
return false;
}
const size_t scale = model.hparams.f16 ? 1 : 2; const size_t scale = model.hparams.ftype ? 1 : 2;
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx); fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
@ -810,7 +872,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head); fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels); fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
fprintf(stderr, "%s: type = %d\n", __func__, model.type); fprintf(stderr, "%s: type = %d\n", __func__, model.type);
// print memory requirements // print memory requirements
@ -821,7 +883,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
MEM_REQ_SCRATCH1.at(model.type) + MEM_REQ_SCRATCH1.at(model.type) +
MEM_REQ_SCRATCH2.at(model.type) + MEM_REQ_SCRATCH2.at(model.type) +
MEM_REQ_SCRATCH3.at(model.type) + MEM_REQ_SCRATCH3.at(model.type) +
scale*MEM_REQ_MODEL.at (model.type) + scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
scale*MEM_REQ_KV_CROSS.at(model.type) + scale*MEM_REQ_KV_CROSS.at(model.type) +
scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)); scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
@ -837,7 +899,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// always have at least one decoder // always have at least one decoder
wctx.model.buf = new std::vector<uint8_t>(); wctx.model.buf = new std::vector<uint8_t>();
wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type)); wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type));
// we skip initialization of the state until it is needed // we skip initialization of the state until it is needed
// because it might be that state will always be provided externally. // because it might be that state will always be provided externally.
@ -928,6 +990,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
size_t ctx_size = 0; size_t ctx_size = 0;
const ggml_type wtype = wctx.wtype; const ggml_type wtype = wctx.wtype;
const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
{ {
const auto & hparams = model.hparams; const auto & hparams = model.hparams;
@ -946,92 +1009,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
// encoder // encoder
{ {
ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe; ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w; ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b; ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
} }
// decoder // decoder
{ {
ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe; ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te; ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w; ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b; ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
} }
// encoder layers // encoder layers
{ {
ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
-ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w
+ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
-ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
+ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
-ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
+ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
+ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
-ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w
+ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
-ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
+ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
-ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w
+ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
-ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w
+ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
-ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
+ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
-ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w
+ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
-ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
+ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
}
// decoder layers
{
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
-ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w
+ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
-ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
+ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
-ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w
+ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
//
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
-ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b
+ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
-ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w
+ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
-ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
+ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
}
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
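The switch from ggml_type_size() to ggml_type_sizef() in these estimates matters once block-quantized weight types are allowed: ggml_type_sizef() reports the size per element as a float (type size divided by block size), so an element count can be multiplied by it directly for Q4_0-style types as well as F16/F32. A minimal sketch of the idea, with hypothetical shapes (not part of the patch):

```
#include <cstdint>
#include <cstdio>
#include "ggml.h"

int main() {
    const int64_t n_state = 512;               // hypothetical layer width
    const int64_t nelem   = 4*n_state*n_state; // e.g. one mlp_0_w-sized matrix

    // ggml_type_sizef(type) == ggml_type_size(type)/ggml_blck_size(type), as a float,
    // so the product below is a byte estimate even for block-quantized types.
    const ggml_type types[] = { GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_Q4_0 };
    for (ggml_type t : types) {
        printf("%-8s ~ %.1f KiB\n", ggml_type_name(t), nelem*ggml_type_sizef(t)/1024.0);
    }
    return 0;
}
```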
@@ -1079,10 +1142,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
{
model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
-model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state);
+model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
-model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state);
+model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
@@ -1259,11 +1322,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
while (true) {
int32_t n_dims;
int32_t length;
-int32_t ftype;
+int32_t ttype;
read_safe(loader, n_dims);
read_safe(loader, length);
-read_safe(loader, ftype);
+read_safe(loader, ttype);
if (loader->eof(loader->context)) {
break;
@@ -1298,9 +1361,9 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
return false;
}
-const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+const size_t bpe = ggml_type_size(ggml_type(ttype));
-if (nelements*bpe != ggml_nbytes(tensor)) {
+if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
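For quantized tensors, the value returned by ggml_type_size() is the size of one whole block rather than of a single element, so the expected byte count has to be divided by the block's element count; for F32/F16 the block size is 1 and the check reduces to the old one. A worked example with assumed numbers (the 20-byte / 32-element block is hypothetical, purely to show the arithmetic):

```
#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical 4-bit block format: 32 elements packed into a 20-byte block.
    const int64_t nelements = 1024; // elements in the tensor
    const size_t  bpe       = 20;   // what ggml_type_size() would report (bytes per block)
    const int     blck      = 32;   // what ggml_blck_size() would report (elements per block)

    const size_t expected = (nelements / blck) * bpe; // what ggml_nbytes() reports
    const size_t checked  = (nelements * bpe) / blck; // what the loader now computes

    printf("expected = %zu, checked = %zu\n", expected, checked); // both 640
    return 0;
}
```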
@@ -1309,7 +1372,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
BYTESWAP_TENSOR(tensor);
-//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
model.n_loaded++;
}
@@ -1385,9 +1448,15 @@ static bool whisper_encode_internal(
}
}
-#ifndef WHISPER_USE_COREML
struct ggml_tensor * cur;
+#ifndef WHISPER_USE_COREML
+const bool use_coreml = false;
+#else
+const bool use_coreml = wstate.ctx_coreml != nullptr;
+#endif
+if (!use_coreml) {
// convolution + gelu
{
wstate.use_buf(ctx0, 1);
@@ -1502,14 +1571,14 @@ static bool whisper_encode_internal(
ggml_permute(ctx0,
ggml_cpy(ctx0,
Qcur,
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_cpy(ctx0,
Kcur,
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
struct ggml_tensor * V =
@@ -1519,7 +1588,7 @@ static bool whisper_encode_internal(
Vcur,
n_state/n_head, n_head, n_ctx),
1, 2, 0, 3),
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
+ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
#else
@@ -1534,7 +1603,7 @@ static bool whisper_encode_internal(
ggml_permute(ctx0,
ggml_cpy(ctx0,
Kcur,
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
+ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
0, 2, 1, 3);
// K * Q
@@ -1548,26 +1617,17 @@ static bool whisper_encode_internal(
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
-//struct ggml_tensor * V_trans =
-// ggml_permute(ctx0,
-// ggml_cpy(ctx0,
-// Vcur,
-// ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-// 1, 2, 0, 3);
-//struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
struct ggml_tensor * V =
ggml_cpy(ctx0,
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
Vcur,
n_state/n_head, n_head, n_ctx),
-0, 2, 1, 3),
+1, 2, 0, 3),
-ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
+ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
);
-struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
+struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
#endif
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
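In the non-flash-attention path the change above permutes V with (1, 2, 0, 3) instead of (0, 2, 1, 3), so the copied tensor already has the layout the explicit ggml_transpose() used to produce, and the transpose before the mul_mat with the softmax output can be dropped. A small shape-only sketch, assuming ggml_permute() places source dimension i at the i-th axis argument (which is consistent with the target tensor shapes in the hunk); the sizes are illustrative:

```
#include <array>
#include <cstdio>

// Shape-only model of ggml_permute(a, d0, d1, d2, d3): source dim i goes to dest dim di.
static std::array<long, 4> permute(const std::array<long, 4> & ne, int d0, int d1, int d2, int d3) {
    std::array<long, 4> out{};
    const int dst[4] = { d0, d1, d2, d3 };
    for (int i = 0; i < 4; ++i) out[dst[i]] = ne[i];
    return out;
}

int main() {
    const long n_state = 512, n_head = 8, n_ctx = 1500; // illustrative sizes
    const std::array<long, 4> ne = { n_state/n_head, n_head, n_ctx, 1 };

    const auto v_old = permute(ne, 0, 2, 1, 3); // {64, 1500, 8, 1}
    const auto v_new = permute(ne, 1, 2, 0, 3); // {1500, 64, 8, 1}

    // old target: (n_state/n_head, n_ctx, n_head); new target: (n_ctx, n_state/n_head, n_head)
    printf("old V layout: %ld %ld %ld\n", v_old[0], v_old[1], v_old[2]);
    printf("new V layout: %ld %ld %ld\n", v_new[0], v_new[1], v_new[2]);
    return 0;
}
```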
@@ -1622,7 +1682,7 @@ static bool whisper_encode_internal(
wstate.use_buf(ctx0, 0);
cur = ggml_flash_ff(ctx0,
-ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
+ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)),
layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
#else
wstate.use_buf(ctx0, 0);
@@ -1693,12 +1753,16 @@ static bool whisper_encode_internal(
//ggml_graph_print(&gf);
}
-#else
+}
+#ifdef WHISPER_USE_COREML
+else
+{
wstate.use_buf(ctx0, -1);
-struct ggml_tensor * cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
+cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
+}
#endif
// cur
@@ -2356,11 +2420,7 @@ static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float>
sum += fft_out[k] * filters.data[j * n_fft + k];
}
-if (sum < 1e-10) {
-sum = 1e-10;
-}
-sum = log10(sum);
+sum = log10(std::max(sum, 1e-10));
mel.data[j * mel.n_len + i] = sum;
}
@@ -2540,9 +2600,9 @@ static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
struct whisper_state * whisper_init_state(whisper_context * ctx) {
whisper_state * state = new whisper_state;
-const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
+const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
-if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
+if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
delete state;
return nullptr;
@@ -2553,7 +2613,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
-if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) {
+if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
delete state;
return nullptr;
@@ -2573,10 +2633,12 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
if (!state->ctx_coreml) {
fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+#ifndef WHISPER_COREML_ALLOW_FALLBACK
return nullptr;
-}
+#endif
+} else {
fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+}
#endif
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
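With the guard above, a failed Core ML load is only fatal when WHISPER_COREML_ALLOW_FALLBACK is left undefined; when it is defined, whisper_init_state() continues and the regular ggml encoder path (the if (!use_coreml) branch shown earlier) is used. A minimal sketch of the compile-time behaviour, assuming the define is passed as a plain compiler flag such as -DWHISPER_COREML_ALLOW_FALLBACK (the make/CMake wiring is not part of this hunk):

```
// Sketch only: mirrors the preprocessor logic in whisper_init_state().
#include <cstdio>

int main() {
#ifdef WHISPER_COREML_ALLOW_FALLBACK
    printf("Core ML load failure: continue with the ggml encoder\n");
#else
    printf("Core ML load failure: whisper_init_state() returns nullptr\n");
#endif
    return 0;
}
```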
@@ -2602,7 +2664,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) {
}
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
-whisper_model_loader loader = {};
fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
@@ -2612,6 +2673,8 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model
return nullptr;
}
+whisper_model_loader loader = {};
loader.context = &fin;
loader.read = [](void * ctx, void * output, size_t read_size) {
@@ -2647,10 +2710,11 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t
};
buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
-whisper_model_loader loader = {};
fprintf(stderr, "%s: loading model from buffer\n", __func__);
+whisper_model_loader loader = {};
loader.context = &ctx;
loader.read = [](void * ctx, void * output, size_t read_size) {
@@ -2747,8 +2811,10 @@ void whisper_free_state(struct whisper_state * state)
}
#ifdef WHISPER_USE_COREML
+if (state->ctx_coreml != nullptr) {
whisper_coreml_free(state->ctx_coreml);
state->ctx_coreml = nullptr;
+}
#endif
delete state;
@@ -2909,7 +2975,6 @@ int whisper_lang_id(const char * lang) {
fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
return -1;
}
return g_lang.at(lang).first;
}
@@ -3047,8 +3112,8 @@ int whisper_model_n_mels(struct whisper_context * ctx) {
return ctx->model.hparams.n_mels;
}
-int whisper_model_f16(struct whisper_context * ctx) {
+int whisper_model_ftype(struct whisper_context * ctx) {
-return ctx->model.hparams.f16;
+return ctx->model.hparams.ftype;
}
int whisper_model_type(struct whisper_context * ctx) {
@@ -3303,15 +3368,15 @@ static void whisper_exp_compute_token_level_timestamps(
// trim from start (in place)
static inline void ltrim(std::string &s) {
-s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
+s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
-return !std::isspace(ch);
+return std::isspace(ch);
}));
}
// trim from end (in place)
static inline void rtrim(std::string &s) {
-s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
+s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
-return !std::isspace(ch);
+return std::isspace(ch);
}).base(), s.end());
}
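The trim helpers keep the same behaviour; std::find_if_not with std::isspace simply replaces the negated predicate. A quick self-contained usage sketch of the two helpers exactly as defined above:

```
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <string>

static inline void ltrim(std::string &s) {
    s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
        return std::isspace(ch);
    }));
}

static inline void rtrim(std::string &s) {
    s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
        return std::isspace(ch);
    }).base(), s.end());
}

int main() {
    std::string s = "   hello world \n";
    ltrim(s);
    rtrim(s);
    printf("[%s]\n", s.c_str()); // prints [hello world]
    return 0;
}
```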
@@ -3844,7 +3909,7 @@ int whisper_full_with_state(
}
const int seek_start = params.offset_ms/10;
-const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len_from_state(state) : params.duration_ms/10);
+const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
// if length of spectrogram is less than 1s (100 samples), then return
// basically don't process anything that is less than 1s
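The corrected expression only adds seek_start when an explicit duration is requested; otherwise seek_end is simply the end of the spectrogram. A worked example with assumed values (a 60 s offset into a 90 s file, so whisper_n_len_from_state() would be about 9000 frames at 10 ms per frame):

```
#include <cstdio>

int main() {
    // Assumed inputs for illustration: 60 s offset, no duration limit, 90 s of audio.
    const int offset_ms   = 60000;
    const int duration_ms = 0;
    const int n_len       = 9000; // stand-in for whisper_n_len_from_state(state)

    const int seek_start   = offset_ms/10;                                            // 6000
    const int seek_end_old = seek_start + (duration_ms == 0 ? n_len : duration_ms/10); // 15000, past the end
    const int seek_end_new = duration_ms == 0 ? n_len : seek_start + duration_ms/10;   // 9000, end of file

    printf("start=%d old_end=%d new_end=%d\n", seek_start, seek_end_old, seek_end_new);
    return 0;
}
```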
@@ -4823,23 +4888,32 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+// put a bunch of random data in the buffer
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
for (int j = 0; j < (int) sizes.size(); j++) {
+int n_q4_0 = 0;
+int n_q4_1 = 0;
int n_fp16 = 0;
int n_fp32 = 0;
// GFLOPS/s
+double s_q4_0 = 0.0;
+double s_q4_1 = 0.0;
double s_fp16 = 0.0;
double s_fp32 = 0.0;
const size_t N = sizes[j];
-for (int k = 0; k < 2; ++k) {
+for (int k = 0; k < 4; ++k) {
-const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+const ggml_type wtype =
+k == 0 ? GGML_TYPE_Q4_0 :
+k == 1 ? GGML_TYPE_Q4_1 :
+k == 2 ? GGML_TYPE_F16 :
+GGML_TYPE_F32;
-double & s = k == 0 ? s_fp16 : s_fp32;
+double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
-int & n = k == 0 ? n_fp16 : n_fp32;
+int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
@@ -4883,8 +4957,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
s = ((2.0*N*N*N*n)/tsum)*1e-9;
}
-snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
+snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n",
-N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32);
s += strbuf;
}
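For reference, the per-type GFLOPS figure comes from the 2·N³ multiply-adds of an N×N matrix product, as computed above: s = (2·N³·n / tsum)·1e-9, where n is the number of runs completed in tsum seconds. A quick numeric check with assumed benchmark figures:

```
#include <cstdio>

int main() {
    // Assumed values, purely for illustration.
    const double N    = 1024; // matrix dimension
    const double n    = 10;   // runs completed
    const double tsum = 1.0;  // total time in seconds

    const double gflops = ((2.0*N*N*N*n)/tsum)*1e-9; // same formula as the bench code
    printf("%.1f GFLOPS\n", gflops); // ~21.5
    return 0;
}
```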


@@ -258,7 +258,7 @@ extern "C" {
WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx);
-WHISPER_API int whisper_model_f16 (struct whisper_context * ctx);
+WHISPER_API int whisper_model_ftype (struct whisper_context * ctx);
WHISPER_API int whisper_model_type (struct whisper_context * ctx);
// Token logits obtained from the last call to whisper_decode()